[AMDGPU][GFX11] Mitigate VALU mask write hazard
VALU use of an SGPR (pair) as mask followed by SALU write to the
same SGPR can cause incorrect execution of subsequent SALU reads
of the SGPR.
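
For illustration, a minimal sketch of the sequence on a wave64 target; the specific opcodes and registers here are hypothetical, chosen only to show the pattern:

  v_cndmask_b32_e64 v0, v1, v2, s[2:3]     ; (1) VALU reads s[2:3] as a mask
  s_mov_b64         s[2:3], exec           ; (2) SALU writes s[2:3]
  s_and_b64         s[4:5], s[2:3], s[6:7] ; (3) SALU read may observe a stale value

The patch mitigates this by inserting s_waitcnt_depctr sa_sdst(0) after (2). Because the distance between (2) and (3) clears the hazard less than 10% of the time in practice, the recognizer assumes the hazard whenever (1) and (2) are found rather than searching for (3).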

Reviewed By: foad, rampitec

Differential Revision: https://reviews.llvm.org/D134151
perlfu committed Oct 1, 2022
1 parent a5c46bf commit a35013b
Showing 4 changed files with 701 additions and 0 deletions.
138 changes: 138 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -1102,6 +1102,7 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
  fixVALUTransUseHazard(MI);
  fixWMMAHazards(MI);
  fixShift64HighRegBug(MI);
  fixVALUMaskWriteHazard(MI);
}

bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
@@ -2709,3 +2710,140 @@ bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {

  return false;
}

bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
  if (!ST.isWave64())
    return false;
  if (!ST.hasVALUMaskWriteHazard())
    return false;
  if (!SIInstrInfo::isSALU(*MI))
    return false;

  // The hazard sequence is three instructions:
  //   1. VALU reads SGPR as mask
  //   2. SALU writes SGPR
  //   3. SALU reads SGPR
  // The hazard can expire if the distance between 2 and 3 is sufficient.
  // In practice this happens <10% of the time, hence this always assumes
  // the hazard exists if 1 and 2 are present to avoid searching.

  const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
  if (!SDSTOp || !SDSTOp->isReg())
    return false;

  const Register HazardReg = SDSTOp->getReg();
  if (HazardReg == AMDGPU::EXEC ||
      HazardReg == AMDGPU::EXEC_LO ||
      HazardReg == AMDGPU::EXEC_HI ||
      HazardReg == AMDGPU::M0)
    return false;

  auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
    switch (I.getOpcode()) {
    case AMDGPU::V_ADDC_U32_e32:
    case AMDGPU::V_ADDC_U32_dpp:
    case AMDGPU::V_CNDMASK_B16_e32:
    case AMDGPU::V_CNDMASK_B16_dpp:
    case AMDGPU::V_CNDMASK_B32_e32:
    case AMDGPU::V_CNDMASK_B32_dpp:
    case AMDGPU::V_DIV_FMAS_F32_e64:
    case AMDGPU::V_DIV_FMAS_F64_e64:
    case AMDGPU::V_SUBB_U32_e32:
    case AMDGPU::V_SUBB_U32_dpp:
    case AMDGPU::V_SUBBREV_U32_e32:
    case AMDGPU::V_SUBBREV_U32_dpp:
      // These implicitly read VCC as mask source.
      return HazardReg == AMDGPU::VCC ||
             HazardReg == AMDGPU::VCC_LO ||
             HazardReg == AMDGPU::VCC_HI;
    case AMDGPU::V_ADDC_U32_e64:
    case AMDGPU::V_ADDC_U32_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_e64:
    case AMDGPU::V_CNDMASK_B16_e64_dpp:
    case AMDGPU::V_CNDMASK_B32_e64:
    case AMDGPU::V_CNDMASK_B32_e64_dpp:
    case AMDGPU::V_SUBB_U32_e64:
    case AMDGPU::V_SUBB_U32_e64_dpp:
    case AMDGPU::V_SUBBREV_U32_e64:
    case AMDGPU::V_SUBBREV_U32_e64_dpp: {
      // Only check mask register overlaps.
      const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
      assert(SSRCOp);
      return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
    }
    default:
      return false;
    }
  };

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
    // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
    if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        !(I.getOperand(0).getImm() & 0x1))
      return true;

    // VALU access to any SGPR or literal constant other than HazardReg
    // mitigates hazard. No need to check HazardReg here as this will
    // only be called when !IsHazardFn.
    if (!SIInstrInfo::isVALU(I))
      return false;
    for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
      const MachineOperand &Op = I.getOperand(OpNo);
      if (Op.isReg()) {
        Register OpReg = Op.getReg();
        // Only consider uses
        if (!Op.isUse())
          continue;
        // Ignore EXEC
        if (OpReg == AMDGPU::EXEC ||
            OpReg == AMDGPU::EXEC_LO ||
            OpReg == AMDGPU::EXEC_HI)
          continue;
        // Ignore all implicit uses except VCC
        if (Op.isImplicit()) {
          if (OpReg == AMDGPU::VCC ||
              OpReg == AMDGPU::VCC_LO ||
              OpReg == AMDGPU::VCC_HI)
            return true;
          continue;
        }
        if (TRI.isSGPRReg(MRI, OpReg))
          return true;
      } else {
        const MCInstrDesc &InstDesc = I.getDesc();
        const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo];
        if (TII.isLiteralConstant(Op, OpInfo))
          return true;
      }
    }
    return false;
  };

  // Check for hazard
  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  auto NextMI = std::next(MI->getIterator());

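  // The immediate 0xfffe leaves every DEPCTR field at its no-wait value
  // except sa_sdst (bit 0), which is 0: wait for the SALU SDST write.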
  // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
  BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0xfffe);

  // SALU write may be s_getpc in a bundle.
  if (MI->getOpcode() == AMDGPU::S_GETPC_B64) {
    // Update offsets of any references in the bundle.
    while (NextMI != MI->getParent()->end() &&
           NextMI->isBundledWithPred()) {
      for (auto &Operand : NextMI->operands()) {
        if (Operand.isGlobal())
          Operand.setOffset(Operand.getOffset() + 4);
      }
      NextMI++;
    }
  }

  return true;
}
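
For reference, the transformed version of the earlier hypothetical sequence; the recognizer places the wait immediately after the SALU mask write:

  v_cndmask_b32_e64 v0, v1, v2, s[2:3]     ; (1) VALU mask read
  s_mov_b64         s[2:3], exec           ; (2) SALU mask write
  s_waitcnt_depctr  0xfffe                 ; inserted: sa_sdst(0)
  s_and_b64         s[4:5], s[2:3], s[6:7] ; (3) now reads the settled value

When the SALU write is s_getpc_b64 inside a bundle, the inserted 4-byte instruction shifts the PC-relative code that follows it, which is why the loop above adds 4 to the offset of every global operand remaining in the bundle.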
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -106,6 +106,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
  bool fixVALUTransUseHazard(MachineInstr *MI);
  bool fixWMMAHazards(MachineInstr *MI);
  bool fixShift64HighRegBug(MachineInstr *MI);
  bool fixVALUMaskWriteHazard(MachineInstr *MI);

  int checkMAIHazards(MachineInstr *MI);
  int checkMAIHazards908(MachineInstr *MI);
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1058,6 +1058,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,

  bool hasVALUTransUseHazard() const { return getGeneration() >= GFX11; }

  bool hasVALUMaskWriteHazard() const { return getGeneration() >= GFX11; }

  /// Return if operations acting on VGPR tuples require even alignment.
  bool needsAlignedVGPRs() const { return GFX90AInsts; }
