[AMDGPU] Enable compare operations to be selected by divergence
Summary: This patch enables SETCC nodes to be selected to S_CMP_* when the comparison is uniform and to V_CMP_* when it is divergent.

Reviewers: rampitec, arsenm

Reviewed By: rampitec

Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, kerbowa, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D82194
alex-t committed Jun 24, 2020
1 parent b769eb0 commit 521ac0b
Showing 63 changed files with 2,163 additions and 1,681 deletions.
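
As a concrete illustration of the summary (a minimal sketch; the function and value names here are ours, not part of the patch): a compare on a scalar kernel argument is uniform, so after this patch it should select to s_cmp_*, while a compare on the per-lane workitem id is divergent and keeps selecting to v_cmp_*.

; Illustrative only: %a is a scalar kernel argument, so the first icmp is
; uniform (expect s_cmp_lg_u32); the workitem id differs per lane, so the
; second icmp is divergent (expect v_cmp_ne_u32).
define amdgpu_kernel void @uniform_vs_divergent_cmp(i32 addrspace(1)* %out, i32 %a) {
entry:
  %ucmp = icmp ne i32 %a, 0
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %dcmp = icmp ne i32 %tid, 0
  %both = and i1 %ucmp, %dcmp
  %val = select i1 %both, i32 1, i32 0
  store i32 %val, i32 addrspace(1)* %out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()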
119 changes: 74 additions & 45 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -602,6 +602,12 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}

if (RC == &AMDGPU::SReg_64RegClass) {
if (SrcReg == AMDGPU::SCC) {
BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
.addImm(1)
.addImm(0);
return;
}
if (DestReg == AMDGPU::VCC) {
if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
@@ -4088,20 +4094,20 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32;
case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32;
case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
@@ -4492,13 +4498,13 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
continue;
}

if (RI.hasAGPRs(MRI.getRegClass(MO.getReg())) &&
if (RI.hasAGPRs(RI.getRegClassForReg(MRI, MO.getReg())) &&
!isOperandLegal(MI, Idx, &MO)) {
legalizeOpWithMove(MI, Idx);
continue;
}

if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
continue; // VGPRs are legal

// We can use one SGPR in each VOP3 instruction prior to GFX10
@@ -5134,7 +5140,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,

unsigned Opcode = Inst.getOpcode();
unsigned NewOpcode = getVALUOp(Inst);

Register CondReg = RI.getVCC();
// Handle some special cases
switch (Opcode) {
default:
@@ -5253,19 +5259,19 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
continue;

case AMDGPU::S_CBRANCH_SCC0:
case AMDGPU::S_CBRANCH_SCC1:
case AMDGPU::S_CBRANCH_SCC1: {
// Clear unused bits of vcc
if (ST.isWave32())
BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B32),
AMDGPU::VCC_LO)
.addReg(AMDGPU::EXEC_LO)
.addReg(AMDGPU::VCC_LO);
else
BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
AMDGPU::VCC)
.addReg(AMDGPU::EXEC)
.addReg(AMDGPU::VCC);
Register CondReg = Inst.getOperand(1).getReg();
bool IsSCC = CondReg == AMDGPU::SCC;
Register VCC = RI.getVCC();
Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC)
.addReg(EXEC)
.addReg(IsSCC ? VCC : CondReg);
Inst.RemoveOperand(1);
break;
}

case AMDGPU::S_BFE_U64:
case AMDGPU::S_BFM_B64:
@@ -5366,6 +5372,33 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
Inst.eraseFromParent();
}
continue;
case AMDGPU::S_CMP_EQ_I32:
case AMDGPU::S_CMP_LG_I32:
case AMDGPU::S_CMP_GT_I32:
case AMDGPU::S_CMP_GE_I32:
case AMDGPU::S_CMP_LT_I32:
case AMDGPU::S_CMP_LE_I32:
case AMDGPU::S_CMP_EQ_U32:
case AMDGPU::S_CMP_LG_U32:
case AMDGPU::S_CMP_GT_U32:
case AMDGPU::S_CMP_GE_U32:
case AMDGPU::S_CMP_LT_U32:
case AMDGPU::S_CMP_LE_U32:
case AMDGPU::S_CMP_EQ_U64:
case AMDGPU::S_CMP_LG_U64: {
const MCInstrDesc &NewDesc = get(NewOpcode);
CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
MachineInstr *NewInstr =
BuildMI(*MBB, Inst, Inst.getDebugLoc(), NewDesc, CondReg)
.add(Inst.getOperand(0))
.add(Inst.getOperand(1));
legalizeOperands(*NewInstr, MDT);
int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC);
MachineOperand SCCOp = Inst.getOperand(SCCIdx);
addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
Inst.eraseFromParent();
continue;
}
}

if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
@@ -5387,7 +5420,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
// Only propagate through live-def of SCC.
if (Op.isDef() && !Op.isDead())
addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
addSCCDefUsersToVALUWorklist(Op, Inst, Worklist, RI.getVCC());
Inst.RemoveOperand(i);
}
}
@@ -5801,9 +5834,9 @@ void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
&AMDGPU::SGPR_32RegClass;

const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
const TargetRegisterClass *Src1RC = Src1.isReg() ?
MRI.getRegClass(Src1.getReg()) :
&AMDGPU::SGPR_32RegClass;
const TargetRegisterClass *Src1RC =
Src1.isReg() ? RI.getRegClassForReg(MRI, Src1.getReg())
: &AMDGPU::SGPR_32RegClass;

const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);

@@ -6086,7 +6119,8 @@ void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,

void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
MachineInstr &SCCDefInst,
SetVectorType &Worklist) const {
SetVectorType &Worklist,
Register NewCond) const {
// Ensure that def inst defines SCC, which is still live.
assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
!Op.isDead() && Op.getParent() == &SCCDefInst);
@@ -6097,23 +6131,18 @@ void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
SCCDefInst.getParent()->end())) {
// Check if SCC is used first.
if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1) {
int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI);
if (SCCIdx != -1) {
if (MI.isCopy()) {
MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
unsigned DestReg = MI.getOperand(0).getReg();
SmallVector<MachineInstr *, 4> Users;
for (auto &User : MRI.use_nodbg_instructions(DestReg)) {
if ((User.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) ||
(User.getOpcode() == AMDGPU::S_SUB_CO_PSEUDO)) {
Users.push_back(&User);
Worklist.insert(&User);
}
}
for (auto &U : Users)
U->getOperand(4).setReg(RI.getVCC());
MRI.replaceRegWith(DestReg, NewCond);
CopyToDelete.push_back(&MI);
} else
} else {
if (NewCond.isValid())
MI.getOperand(SCCIdx).setReg(NewCond);
Worklist.insert(&MI);
}
}
// Exit if we find another SCC def.
if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1)
3 changes: 2 additions & 1 deletion llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -124,7 +124,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {

void addSCCDefUsersToVALUWorklist(MachineOperand &Op,
MachineInstr &SCCDefInst,
SetVectorType &Worklist) const;
SetVectorType &Worklist,
Register NewCond = Register()) const;

const TargetRegisterClass *
getDestEquivalentVGPRClass(const MachineInstr &Inst) const;
10 changes: 1 addition & 9 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -571,15 +571,7 @@ def atomic_store_local_64_m0 : PatFrag <
def si_setcc_uniform : PatFrag <
(ops node:$lhs, node:$rhs, node:$cond),
(setcc node:$lhs, node:$rhs, node:$cond), [{
for (SDNode *Use : N->uses()) {
if (Use->isMachineOpcode() || Use->getOpcode() != ISD::CopyToReg)
return false;

unsigned Reg = cast<RegisterSDNode>(Use->getOperand(1))->getReg();
if (Reg != AMDGPU::SCC)
return false;
}
return true;
return !N->isDivergent();
}]>;
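
For context on this change (our reading, not text from the patch): the old predicate accepted a setcc only when every use was a CopyToReg into SCC, i.e. it inferred uniformity from the users; the new predicate asks the divergence analysis directly via N->isDivergent(). One case that plausibly changes, sketched below with our own names: a uniform compare whose only use is a select failed the old use scan but now matches si_setcc_uniform and selects to s_cmp_*; where a VALU v_cndmask_b32 consumes the result, an s_cselect_b64 vcc, 1, 0 rematerializes the mask, as the test updates that follow show.

; Sketch (illustrative): the compare's only use is a select, not a copy to
; SCC, so the old predicate rejected it; the compare is uniform, so it now
; selects to s_cmp_lg_u32 (with s_cselect_b64 feeding the v_cndmask_b32).
define amdgpu_kernel void @uniform_setcc_feeding_select(i32 addrspace(1)* %out, i32 %a, i32 %b) {
  %cmp = icmp ne i32 %a, 0
  %val = select i1 %cmp, i32 %a, i32 %b
  store i32 %val, i32 addrspace(1)* %out
  ret void
}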

//===----------------------------------------------------------------------===//
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
@@ -57,8 +57,8 @@ entry:
}

; FUNC-LABEL: {{^}}null_32bit_lds_ptr:
; SI: v_cmp_ne_u32
; SI-NOT: v_cmp_ne_u32
; SI: s_cmp_lg_u32
; SI: s_cselect_b64 vcc, 1, 0
; SI: v_cndmask_b32
define amdgpu_kernel void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) nounwind {
%cmp = icmp ne i32 addrspace(3)* %lds, null
18 changes: 12 additions & 6 deletions llvm/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -10,7 +10,8 @@
; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
; CI-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
; CI-DAG: s_cmp_lg_u32 [[PTR]], -1
; CI-DAG: s_cselect_b64 vcc, 1, 0
; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
@@ -22,7 +23,8 @@
; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_SHARED_BASE]]

; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base
; GFX9: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
; GFX9: s_cmp_lg_u32 [[PTR]], -1
; GFX9: s_cselect_b64 vcc, 1, 0
; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
; GFX9-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
@@ -76,7 +78,8 @@ define void @use_group_to_flat_addrspacecast_func(i32 addrspace(3)* %ptr) #0 {
; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]

; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; CI-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
; CI-DAG: s_cmp_lg_u32 [[PTR]], -1
; CI-DAG: s_cselect_b64 vcc, 1, 0
; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
@@ -89,7 +92,8 @@ define void @use_group_to_flat_addrspacecast_func(i32 addrspace(3)* %ptr) #0 {
; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_private_base

; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; GFX9: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
; GFX9: s_cmp_lg_u32 [[PTR]], -1
; GFX9: s_cselect_b64 vcc, 1, 0
; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
; GFX9: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
@@ -148,7 +152,8 @@ define amdgpu_kernel void @use_constant_to_global_addrspacecast(i32 addrspace(4)
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}
; HSA-DAG: v_cmp_ne_u64_e64 vcc, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0{{$}}
; CI-DAG: v_cmp_ne_u64_e64 vcc, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0{{$}}
; GFX9-DAG: s_cmp_lg_u64 s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
@@ -165,7 +170,8 @@ define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32* %ptr) #0 {
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}
; HSA-DAG: v_cmp_ne_u64_e64 vcc, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0{{$}}
; CI-DAG: v_cmp_ne_u64_e64 vcc, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0{{$}}
; GFX9-DAG: s_cmp_lg_u64 s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
3 changes: 2 additions & 1 deletion llvm/test/CodeGen/AMDGPU/amdgcn.private-memory.ll
@@ -18,7 +18,8 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone

; GCN-ALLOCA: v_add_{{[iu]}}32_e32 [[RESULT:v[0-9]+]], vcc, v{{[0-9]+}}, v0

; GCN-PROMOTE: v_cmp_eq_u32_e64 vcc, [[IN]], 1
; GCN-PROMOTE: s_cmp_eq_u32 [[IN]], 1
; GCN-PROMOTE: s_cselect_b64 vcc, 1, 0
; GCN-PROMOTE-NEXT: v_addc_u32_e32 [[RESULT:v[0-9]+]], vcc, 0, v0, vcc

; GCN: buffer_store_dword [[RESULT]]