Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AMDGPU] New AMDGPUInsertSingleUseVDST pass #72388

Merged
merged 6 commits into from
Nov 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,9 @@ extern char &SIModeRegisterID;
void initializeAMDGPUInsertDelayAluPass(PassRegistry &);
extern char &AMDGPUInsertDelayAluID;

// Registration hook and pass identifier for the AMDGPUInsertSingleUseVDST
// machine function pass (the pass that inserts s_singleuse_vdst instructions).
void initializeAMDGPUInsertSingleUseVDSTPass(PassRegistry &);
extern char &AMDGPUInsertSingleUseVDSTID;

void initializeSIInsertHardClausesPass(PassRegistry &);
extern char &SIInsertHardClausesID;

Expand Down
122 changes: 122 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
//===- AMDGPUInsertSingleUseVDST.cpp - Insert s_singleuse_vdst instructions ==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert s_singleuse_vdst instructions on GFX11.5+ to mark regions of VALU
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently this only marks regions of a single instruction?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Correct. There is more work to be done here.

/// instructions that produce single-use VGPR values. If the value is forwarded
/// to the consumer instruction prior to VGPR writeback, the hardware can
/// then skip (kill) the VGPR write.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/Register.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCRegister.h"
#include "llvm/Pass.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-insert-single-use-vdst"

namespace {
/// Machine function pass that inserts an S_SINGLEUSE_VDST instruction in
/// front of each VALU instruction for which every register it modifies has a
/// tracked read count of at most one.
///
/// Read counts are gathered per basic block by walking the block backwards.
/// Registers that are live out of the block, and every register tracked at
/// the point where an instruction modifies EXEC, are given a count of 2 so
/// that they are never treated as single-use.
class AMDGPUInsertSingleUseVDST : public MachineFunctionPass {
private:
// Instruction info of the current function's subtarget. Assigned in
// runOnMachineFunction() before emitSingleUseVDST() is called; it has no
// initializer here.
const SIInstrInfo *SII;

public:
static char ID;

AMDGPUInsertSingleUseVDST() : MachineFunctionPass(ID) {}

/// Insert an S_SINGLEUSE_VDST instruction with immediate operand 0x1 into
/// \p MI's parent block, immediately before \p MI. The new instruction is
/// built with an empty DebugLoc.
void emitSingleUseVDST(MachineInstr &MI) const {
// Mark the following instruction as a single-use producer:
// s_singleuse_vdst { supr0: 1 }
BuildMI(*MI.getParent(), MI, DebugLoc(), SII->get(AMDGPU::S_SINGLEUSE_VDST))
.addImm(0x1);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does the immediate mean?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've commented the specific case we're using. In general it can indicate up to three regions of "single-use producer" VALU instructions but I don't want to add support for the encoding of them until we're actually using it. There is also a pretty assembler/disassembler syntax for that which is not yet implemented.

}

/// Scan every basic block of \p MF bottom-up and insert S_SINGLEUSE_VDST
/// before each qualifying VALU instruction.
///
/// Returns false without modifying the function when the subtarget's
/// hasVGPRSingleUseHintInsts() is false; otherwise returns true if and only
/// if at least one instruction was inserted.
bool runOnMachineFunction(MachineFunction &MF) override {
const auto &ST = MF.getSubtarget<GCNSubtarget>();
if (!ST.hasVGPRSingleUseHintInsts())
return false;

SII = ST.getInstrInfo();
const auto *TRI = &SII->getRegisterInfo();
bool InstructionEmitted = false;

for (MachineBasicBlock &MBB : MF) {
// Number of reads seen so far (walking backwards) for each register. A
// fresh map is created for every block, so counts never carry across
// block boundaries.
DenseMap<MCPhysReg, unsigned> RegisterUseCount; // TODO: MCRegUnits

// Handle boundaries at the end of basic block separately to avoid
// false positives. If they are live at the end of a basic block then
// assume it has more uses later on.
for (const auto &Liveouts : MBB.liveouts())
RegisterUseCount[Liveouts.PhysReg] = 2;

// Walk the block in reverse so that reads located after an instruction
// have already been counted when that instruction is examined.
for (MachineInstr &MI : reverse(MBB.instrs())) {
// All registers in all operands need to be single use for an
// instruction to be marked as a single use producer.
bool AllProducerOperandsAreSingleUse = true;

for (const auto &Operand : MI.operands()) {
if (!Operand.isReg())
continue;
const auto Reg = Operand.getReg();

// Count the number of times each register is read.
if (Operand.readsReg())
RegisterUseCount[Reg]++;

// Do not attempt to optimise across exec mask changes.
// The condition depends only on MI, yet it is evaluated once per
// register operand; each time it holds, every count currently in the
// map (including a read counted just above) is forced to 2.
if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
for (auto &UsedReg : RegisterUseCount)
UsedReg.second = 2;
}

// If we are at the point where the register first became live,
// check if the operands are single use.
if (!MI.modifiesRegister(Reg, TRI))
continue;
// operator[] creates a zero entry when no read of Reg has been
// recorded; that entry is removed again by the erase below.
if (RegisterUseCount[Reg] > 1)
AllProducerOperandsAreSingleUse = false;
// Reset uses count when a register is no longer live.
// NOTE(review): this path is taken for any register operand whose
// register MI modifies, use operands included. A read by MI of a
// register it also writes therefore appears to be counted above and
// then erased here, so it would not be seen by an earlier definition
// of that register -- confirm whether that is intended.
RegisterUseCount.erase(Reg);
}
// Only VALU instructions are marked; for other instructions the flag
// computed above is ignored.
if (AllProducerOperandsAreSingleUse && SIInstrInfo::isVALU(MI)) {
// TODO: Replace with candidate logging for instruction grouping
// later.
emitSingleUseVDST(MI);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this work with bundles -- do that need a test?

InstructionEmitted = true;
}
}
}
return InstructionEmitted;
}
};
} // namespace

// Storage for the pass ID that the constructor hands to MachineFunctionPass.
char AMDGPUInsertSingleUseVDST::ID = 0;

// Exported reference to the pass ID, so code outside this file can refer to
// the pass without seeing the class definition.
char &llvm::AMDGPUInsertSingleUseVDSTID = AMDGPUInsertSingleUseVDST::ID;

// Register the pass under the name given by DEBUG_TYPE
// ("amdgpu-insert-single-use-vdst") with the description string below.
// NOTE(review): the two trailing `false` arguments are presumably the
// CFG-only and is-analysis flags of INITIALIZE_PASS -- confirm against
// llvm/PassSupport.h.
INITIALIZE_PASS(AMDGPUInsertSingleUseVDST, DEBUG_TYPE,
"AMDGPU Insert SingleUseVDST", false, false)
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,12 @@ static cl::opt<bool> EnableSIModeRegisterPass(
cl::init(true),
cl::Hidden);

// Enable GFX11.5+ s_singleuse_vdst insertion.
// Hidden command-line flag "amdgpu-enable-single-use-vdst"; its initial value
// is false.
static cl::opt<bool>
EnableInsertSingleUseVDST("amdgpu-enable-single-use-vdst",
cl::desc("Enable s_singleuse_vdst insertion"),
cl::init(false), cl::Hidden);

// Enable GFX11+ s_delay_alu insertion
static cl::opt<bool>
EnableInsertDelayAlu("amdgpu-enable-delay-alu",
Expand Down Expand Up @@ -404,6 +410,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
initializeAMDGPUUnifyMetadataPass(*PR);
initializeSIAnnotateControlFlowPass(*PR);
initializeAMDGPUInsertSingleUseVDSTPass(*PR);
initializeAMDGPUInsertDelayAluPass(*PR);
initializeSIInsertHardClausesPass(*PR);
initializeSIInsertWaitcntsPass(*PR);
Expand Down Expand Up @@ -1448,6 +1455,9 @@ void GCNPassConfig::addPreEmitPass() {
// cases.
addPass(&PostRAHazardRecognizerID);

if (isPassEnabled(EnableInsertSingleUseVDST, CodeGenOptLevel::Less))
addPass(&AMDGPUInsertSingleUseVDSTID);

if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less))
addPass(&AMDGPUInsertDelayAluID);

Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUMacroFusion.cpp
AMDGPUMCInstLower.cpp
AMDGPUIGroupLP.cpp
AMDGPUInsertSingleUseVDST.cpp
AMDGPUMIRFormatter.cpp
AMDGPUOpenCLEnqueuedBlockLowering.cpp
AMDGPUPerfHintAnalysis.cpp
Expand Down