Skip to content

Commit

Permalink
AMDGPU/SI: Turn off GPR Indexing Mode immediately after the intereste…
Browse files Browse the repository at this point in the history
…d instruction.

Summary:
  In the current implementation of GPR Indexing Mode, when the index is non-uniform, the s_set_gpr_idx_off instruction
is incorrectly inserted after the loop. This causes instructions with VGPR operands (v_readfirstlane, for example) to read the wrong
VGPR.
 In this patch, we fix the issue by inserting s_set_gpr_idx_on/off immediately around the instruction of interest.

Reviewers:
  rampitec

Differential Revision:
  https://reviews.llvm.org/D43297

llvm-svn: 325355
  • Loading branch information
Changpeng Fang committed Feb 16, 2018
1 parent ff53a4a commit da38b5f
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 77 deletions.
61 changes: 23 additions & 38 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Expand Up @@ -2744,7 +2744,8 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
unsigned PhiReg,
unsigned InitSaveExecReg,
int Offset,
bool UseGPRIdxMode) {
bool UseGPRIdxMode,
bool IsIndirectSrc) {
MachineBasicBlock::iterator I = LoopBB.begin();

unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
Expand Down Expand Up @@ -2773,6 +2774,12 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
.addReg(CurrentIdxReg)
.addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());

// Update EXEC, save the original EXEC value to VCC.
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
.addReg(CondReg, RegState::Kill);

MRI.setSimpleHint(NewExec, CondReg);

if (UseGPRIdxMode) {
unsigned IdxReg;
if (Offset == 0) {
Expand All @@ -2783,11 +2790,13 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
.addReg(CurrentIdxReg, RegState::Kill)
.addImm(Offset);
}

MachineInstr *SetIdx =
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_IDX))
.addReg(IdxReg, RegState::Kill);
SetIdx->getOperand(2).setIsUndef();
unsigned IdxMode = IsIndirectSrc ?
VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
MachineInstr *SetOn =
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
.addReg(IdxReg, RegState::Kill)
.addImm(IdxMode);
SetOn->getOperand(3).setIsUndef();
} else {
// Move index from VCC into M0
if (Offset == 0) {
Expand All @@ -2800,12 +2809,6 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
}
}

// Update EXEC, save the original EXEC value to VCC.
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
.addReg(CondReg, RegState::Kill);

MRI.setSimpleHint(NewExec, CondReg);

// Update EXEC, switch all done bits to 0 and all todo bits to 1.
MachineInstr *InsertPt =
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
Expand Down Expand Up @@ -2833,7 +2836,8 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
unsigned InitResultReg,
unsigned PhiReg,
int Offset,
bool UseGPRIdxMode) {
bool UseGPRIdxMode,
bool IsIndirectSrc) {
MachineFunction *MF = MBB.getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
const DebugLoc &DL = MI.getDebugLoc();
Expand Down Expand Up @@ -2872,7 +2876,7 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,

auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
InitResultReg, DstReg, PhiReg, TmpExec,
Offset, UseGPRIdxMode);
Offset, UseGPRIdxMode, IsIndirectSrc);

MachineBasicBlock::iterator First = RemainderBB->begin();
BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
Expand Down Expand Up @@ -3007,24 +3011,16 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,

BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);

if (UseGPRIdxMode) {
MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
.addImm(0) // Reset inside loop.
.addImm(VGPRIndexMode::SRC0_ENABLE);
SetOn->getOperand(3).setIsUndef();

// Disable again after the loop.
BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
}

auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset, UseGPRIdxMode);
auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg,
Offset, UseGPRIdxMode, true);
MachineBasicBlock *LoopBB = InsPt->getParent();

if (UseGPRIdxMode) {
BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
.addReg(SrcReg, RegState::Undef, SubReg)
.addReg(SrcReg, RegState::Implicit)
.addReg(AMDGPU::M0, RegState::Implicit);
BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
} else {
BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
.addReg(SrcReg, RegState::Undef, SubReg)
Expand Down Expand Up @@ -3125,22 +3121,10 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,

const DebugLoc &DL = MI.getDebugLoc();

if (UseGPRIdxMode) {
MachineBasicBlock::iterator I(&MI);

MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
.addImm(0) // Reset inside loop.
.addImm(VGPRIndexMode::DST_ENABLE);
SetOn->getOperand(3).setIsUndef();

// Disable again after the loop.
BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
}

unsigned PhiReg = MRI.createVirtualRegister(VecRC);

auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
Offset, UseGPRIdxMode);
Offset, UseGPRIdxMode, false);
MachineBasicBlock *LoopBB = InsPt->getParent();

if (UseGPRIdxMode) {
Expand All @@ -3150,6 +3134,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
.addReg(Dst, RegState::ImplicitDefine)
.addReg(PhiReg, RegState::Implicit)
.addReg(AMDGPU::M0, RegState::Implicit);
BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
} else {
const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));

Expand Down
62 changes: 23 additions & 39 deletions llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
Expand Up @@ -118,23 +118,21 @@ entry:
; The offset depends on the register that holds the first element of the vector.

; FIXME: The waitcnt for the argument load can go after the loop
; IDXMODE: s_set_gpr_idx_on 0, src0
; GCN: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, exec
; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]:
; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v{{[0-9]+}}
; GCN: s_and_saveexec_b64 vcc, vcc

; MOVREL: s_add_i32 m0, [[READLANE]], 0xfffffe0
; MOVREL: s_and_saveexec_b64 vcc, vcc
; MOVREL: v_movrels_b32_e32 [[RESULT:v[0-9]+]], v1

; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00
; IDXMODE: s_set_gpr_idx_idx [[ADD_IDX]]
; IDXMODE: s_and_saveexec_b64 vcc, vcc
; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], src0
; IDXMODE: v_mov_b32_e32 [[RESULT:v[0-9]+]], v1
; IDXMODE: s_set_gpr_idx_off

; GCN: s_cbranch_execnz

; IDXMODE: s_set_gpr_idx_off
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @extract_neg_offset_vgpr(i32 addrspace(1)* %out) {
entry:
Expand Down Expand Up @@ -250,21 +248,19 @@ entry:
; GCN: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]:
; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]]
; GCN: s_and_saveexec_b64 vcc, vcc

; MOVREL: s_add_i32 m0, [[READLANE]], 0xfffffe00
; MOVREL: s_and_saveexec_b64 vcc, vcc
; MOVREL: v_movreld_b32_e32 [[VEC_ELT0]], 5

; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
; IDXMODE: s_set_gpr_idx_idx [[ADD_IDX]]
; IDXMODE: s_and_saveexec_b64 vcc, vcc
; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst
; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 5
; IDXMODE: s_set_gpr_idx_off

; GCN: s_cbranch_execnz [[LOOPBB]]
; GCN: s_mov_b64 exec, [[SAVEEXEC]]

; IDXMODE: s_set_gpr_idx_off

; GCN: buffer_store_dword
define amdgpu_kernel void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
entry:
Expand All @@ -283,8 +279,6 @@ entry:
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 4{{$}}
; GCN-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x1f4{{$}}

; IDXMODE: s_set_gpr_idx_on 0, dst

; GCN: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec

; The offset depends on the register that holds the first element of the vector.
Expand All @@ -294,12 +288,11 @@ entry:
; MOVREL: v_movreld_b32_e32 [[VEC_ELT0]], [[VAL]]

; IDXMODE: s_add_i32 [[ADD_IDX:s[0-9]+]], [[READLANE]], -16
; IDXMODE: s_set_gpr_idx_idx [[ADD_IDX]]
; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst
; IDXMODE: v_mov_b32_e32 [[VEC_ELT0]], [[VAL]]
; IDXMODE: s_set_gpr_idx_off

; GCN: s_cbranch_execnz

; IDXMODE: s_set_gpr_idx_off
define amdgpu_kernel void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
entry:
%id = call i32 @llvm.amdgcn.workitem.id.x() #1
Expand All @@ -322,52 +315,46 @@ entry:
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], [[S_ELT0]]
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], [[S_ELT1]]

; IDXMODE: s_set_gpr_idx_on 0, src0

; GCN: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec

; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]:
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
; GCN: s_and_saveexec_b64 vcc, vcc

; MOVREL: s_mov_b32 m0, [[READLANE]]
; MOVREL: s_and_saveexec_b64 vcc, vcc
; MOVREL: v_movrels_b32_e32 [[MOVREL0:v[0-9]+]], [[VEC_ELT0]]

; IDXMODE: s_set_gpr_idx_idx [[READLANE]]
; IDXMODE: s_and_saveexec_b64 vcc, vcc
; IDXMODE: s_set_gpr_idx_on [[READLANE]], src0
; IDXMODE: v_mov_b32_e32 [[MOVREL0:v[0-9]+]], [[VEC_ELT0]]
; IDXMODE: s_set_gpr_idx_off

; GCN-NEXT: s_xor_b64 exec, exec, vcc
; GCN-NEXT: s_cbranch_execnz [[LOOP0]]

; FIXME: Redundant copy
; GCN: s_mov_b64 exec, [[MASK]]
; IDXMODE: s_set_gpr_idx_off

; GCN: v_mov_b32_e32 [[VEC_ELT1_2:v[0-9]+]], [[S_ELT1]]

; IDXMODE: s_set_gpr_idx_on 0, src0
; GCN: s_mov_b64 [[MASK2:s\[[0-9]+:[0-9]+\]]], exec

; GCN: [[LOOP1:BB[0-9]+_[0-9]+]]:
; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
; GCN: s_and_saveexec_b64 vcc, vcc

; MOVREL: s_mov_b32 m0, [[READLANE]]
; MOVREL: s_and_saveexec_b64 vcc, vcc
; MOVREL-NEXT: v_movrels_b32_e32 [[MOVREL1:v[0-9]+]], [[VEC_ELT1_2]]

; IDXMODE: s_set_gpr_idx_idx [[READLANE]]
; IDXMODE: s_and_saveexec_b64 vcc, vcc
; IDXMODE: s_set_gpr_idx_on [[READLANE]], src0
; IDXMODE-NEXT: v_mov_b32_e32 [[MOVREL1:v[0-9]+]], [[VEC_ELT1_2]]
; IDXMODE: s_set_gpr_idx_off

; GCN-NEXT: s_xor_b64 exec, exec, vcc
; GCN: s_cbranch_execnz [[LOOP1]]

; IDXMODE: s_set_gpr_idx_off

; GCN: buffer_store_dword [[MOVREL0]]
; GCN: buffer_store_dword [[MOVREL1]]
define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 {
Expand Down Expand Up @@ -403,42 +390,38 @@ bb2:
; GCN: v_mov_b32_e32 v[[VEC_ELT1:[0-9]+]], s{{[0-9]+}}
; GCN: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]]

; IDXMODE: s_set_gpr_idx_on 0, dst

; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]:
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
; GCN: s_and_saveexec_b64 vcc, vcc

; MOVREL: s_mov_b32 m0, [[READLANE]]
; MOVREL: s_and_saveexec_b64 vcc, vcc
; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT0]], [[INS0]]

; IDXMODE: s_set_gpr_idx_idx [[READLANE]]
; IDXMODE: s_and_saveexec_b64 vcc, vcc
; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst
; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT0]], [[INS0]]
; IDXMODE: s_set_gpr_idx_off

; GCN-NEXT: s_xor_b64 exec, exec, vcc
; GCN: s_cbranch_execnz [[LOOP0]]

; FIXME: Redundant copy
; GCN: s_mov_b64 exec, [[MASK:s\[[0-9]+:[0-9]+\]]]
; IDXMODE: s_set_gpr_idx_off

; IDXMODE: s_set_gpr_idx_on 0, dst
; GCN: s_mov_b64 [[MASK]], exec

; GCN: [[LOOP1:BB[0-9]+_[0-9]+]]:
; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
; GCN: s_and_saveexec_b64 vcc, vcc

; MOVREL: s_mov_b32 m0, [[READLANE]]
; MOVREL: s_and_saveexec_b64 vcc, vcc
; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT1]], 63

; IDXMODE: s_set_gpr_idx_idx [[READLANE]]
; IDXMODE: s_and_saveexec_b64 vcc, vcc
; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst
; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT1]], 63
; IDXMODE: s_set_gpr_idx_off

; GCN-NEXT: s_xor_b64 exec, exec, vcc
; GCN: s_cbranch_execnz [[LOOP1]]
Expand Down Expand Up @@ -639,7 +622,6 @@ define amdgpu_kernel void @insertelement_v4f32_or_index(<4 x float> addrspace(1)

; GCN: {{^BB[0-9]+_[0-9]+}}:
; GCN: s_mov_b64 exec,
; IDXMODE: s_set_gpr_idx_off

; GCN: [[BB2]]:
; GCN: v_cmp_le_i32_e32 vcc, s{{[0-9]+}}, [[PHIREG]]
Expand All @@ -648,8 +630,10 @@ define amdgpu_kernel void @insertelement_v4f32_or_index(<4 x float> addrspace(1)
; GCN: [[REGLOOP:BB[0-9]+_[0-9]+]]:
; MOVREL: v_movreld_b32_e32

; IDXMODE: s_set_gpr_idx_idx
; IDXMODE: s_set_gpr_idx_on
; IDXMODE: v_mov_b32_e32
; IDXMODE: s_set_gpr_idx_off

; GCN: s_cbranch_execnz [[REGLOOP]]
define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) #0 {
bb:
Expand Down

0 comments on commit da38b5f

Please sign in to comment.