AMDGPU/GlobalISel: Select llvm.amdgcn.raw.buffer.load
Use intermediate instructions, unlike with buffer stores. This is
necessary because an internal way is needed to distinguish between
signed and unsigned extloads. This introduces some duplication and
near-duplication with the buffer store selection path. The store
handling should perhaps be moved into legalization to match, which
would eliminate the duplication.
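
To illustrate the extload point (a hypothetical sketch, not code from this
patch): the MachineMemOperand on a buffer load records only the memory size,
so whether a sub-dword result is zero- or sign-extended has to be encoded in
the opcode itself.

#include <cassert>

// Hypothetical helper showing the information the intermediate opcodes
// carry: memory size plus extension kind.
enum class BufferLoadOpc { Load, UByte, SByte, UShort, SShort };

BufferLoadOpc pickOpcode(unsigned MemSizeInBytes, bool IsSigned) {
  switch (MemSizeInBytes) {
  case 1:
    return IsSigned ? BufferLoadOpc::SByte : BufferLoadOpc::UByte;
  case 2:
    return IsSigned ? BufferLoadOpc::SShort : BufferLoadOpc::UShort;
  default:
    return BufferLoadOpc::Load; // dword and wider need no extension
  }
}

int main() {
  assert(pickOpcode(1, false) == BufferLoadOpc::UByte);
  assert(pickOpcode(2, true) == BufferLoadOpc::SShort);
  assert(pickOpcode(4, false) == BufferLoadOpc::Load);
  return 0;
}
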
arsenm committed Jan 27, 2020
1 parent e60d658 commit fc90222
Showing 10 changed files with 1,081 additions and 25 deletions.
17 changes: 17 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -130,6 +130,11 @@ def : GINodeEquiv<G_ATOMICRMW_FADD, atomic_load_fadd_glue>;

def : GINodeEquiv<G_AMDGPU_FFBH_U32, AMDGPUffbh_u32>;
def : GINodeEquiv<G_AMDGPU_ATOMIC_CMPXCHG, AMDGPUatomic_cmp_swap>;
def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD, SIbuffer_load>;
def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_USHORT, SIbuffer_load_ushort>;
def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_UBYTE, SIbuffer_load_ubyte>;
def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_SSHORT, SIbuffer_load_short>;
def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_SBYTE, SIbuffer_load_byte>;

// FIXME: Check MMO is atomic
def : GINodeEquiv<G_AMDGPU_ATOMIC_INC, SIatomic_inc>;
@@ -238,3 +243,15 @@ def gi_bitcast_fpimm_to_i32 : GICustomOperandRenderer<"renderBitcastImm">,

def gi_IMMPopCount : GICustomOperandRenderer<"renderPopcntImm">,
    GISDNodeXFormEquiv<IMMPopCount>;

def gi_extract_glc : GICustomOperandRenderer<"renderExtractGLC">,
    GISDNodeXFormEquiv<extract_glc>;

def gi_extract_slc : GICustomOperandRenderer<"renderExtractSLC">,
    GISDNodeXFormEquiv<extract_slc>;

def gi_extract_dlc : GICustomOperandRenderer<"renderExtractDLC">,
    GISDNodeXFormEquiv<extract_dlc>;

def gi_extract_swz : GICustomOperandRenderer<"renderExtractSWZ">,
    GISDNodeXFormEquiv<extract_swz>;
28 changes: 28 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2903,6 +2903,34 @@ void AMDGPUInstructionSelector::renderTruncTImm1(MachineInstrBuilder &MIB,
  MIB.addImm(MI.getOperand(OpIdx).getImm());
}

void AMDGPUInstructionSelector::renderExtractGLC(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm(MI.getOperand(OpIdx).getImm() & 1);
}

void AMDGPUInstructionSelector::renderExtractSLC(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm((MI.getOperand(OpIdx).getImm() >> 1) & 1);
}

void AMDGPUInstructionSelector::renderExtractDLC(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm((MI.getOperand(OpIdx).getImm() >> 2) & 1);
}

void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
}

bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const {
  return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm());
}
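
These four renderers decode a single packed cachepolicy immediate. A minimal
standalone check of the assumed bit layout (illustrative values, not code
from the patch):

#include <cassert>
#include <cstdint>

// Bit layout assumed by the renderExtract* helpers above:
// bit 0 = glc, bit 1 = slc, bit 2 = dlc, bit 3 = swz.
int main() {
  const int64_t CachePolicy = 0b0101; // example value only
  assert((CachePolicy & 1) == 1);        // glc set
  assert(((CachePolicy >> 1) & 1) == 0); // slc clear
  assert(((CachePolicy >> 2) & 1) == 1); // dlc set
  assert(((CachePolicy >> 3) & 1) == 0); // swz clear
  return 0;
}
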
8 changes: 8 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -233,6 +233,14 @@ class AMDGPUInstructionSelector : public InstructionSelector {

  void renderPopcntImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
                       int OpIdx) const;
  void renderExtractGLC(MachineInstrBuilder &MIB, const MachineInstr &MI,
                        int OpIdx) const;
  void renderExtractSLC(MachineInstrBuilder &MIB, const MachineInstr &MI,
                        int OpIdx) const;
  void renderExtractDLC(MachineInstrBuilder &MIB, const MachineInstr &MI,
                        int OpIdx) const;
  void renderExtractSWZ(MachineInstrBuilder &MIB, const MachineInstr &MI,
                        int OpIdx) const;

  bool isInlineImmediate16(int64_t Imm) const;
  bool isInlineImmediate32(int64_t Imm) const;
124 changes: 122 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -18,12 +18,15 @@
#define _USE_MATH_DEFINES
#endif

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
@@ -37,7 +40,7 @@ using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;

using namespace MIPatternMatch;

static LegalityPredicate isMultiple32(unsigned TypeIdx,
unsigned MaxSize = 1024) {
@@ -2327,6 +2330,55 @@ bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
  return true;
}

// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
// offset (the offset that is included in bounds checking and swizzling, to be
// split between the instruction's voffset and immoffset fields) and soffset
// (the offset that is excluded from bounds checking and swizzling, to go in
// the instruction's soffset field). This function takes the first kind of
// offset and figures out how to split it between voffset and immoffset.
std::tuple<Register, unsigned, unsigned>
AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
                                        Register OrigOffset) const {
  const unsigned MaxImm = 4095;
  Register BaseReg;
  unsigned TotalConstOffset;
  MachineInstr *OffsetDef;
  const LLT S32 = LLT::scalar(32);

  std::tie(BaseReg, TotalConstOffset, OffsetDef)
    = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);

  unsigned ImmOffset = TotalConstOffset;

  // If the immediate value is too big for the immoffset field, put the value
  // and -4096 into the immoffset field so that the value that is copied/added
  // for the voffset field is a multiple of 4096, and it stands more chance
  // of being CSEd with the copy/add for another similar load/store.
  // However, do not do that rounding down to a multiple of 4096 if that is a
  // negative number, as it appears to be illegal to have a negative offset
  // in the vgpr, even if adding the immediate offset makes it positive.
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
    ImmOffset = 0;
  }

  if (Overflow != 0) {
    if (!BaseReg) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);
    } else {
      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
    }
  }

  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
}
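
A worked example of the split, restated as standalone C++ (the helper and the
numbers are illustrative, not part of the patch):

#include <cassert>
#include <cstdint>
#include <utility>

// Mirror of the immediate/register split above, assuming the 12-bit
// (0..4095) MUBUF immoffset field.
std::pair<uint32_t, uint32_t> splitOffset(uint32_t TotalConstOffset) {
  const uint32_t MaxImm = 4095;
  uint32_t ImmOffset = TotalConstOffset;
  uint32_t Overflow = ImmOffset & ~MaxImm; // multiple-of-4096 part
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    // Rounding down would leave a negative register offset, so keep the
    // whole value in the register instead.
    Overflow += ImmOffset;
    ImmOffset = 0;
  }
  return {Overflow, ImmOffset}; // {register part, immediate part}
}

int main() {
  assert(splitOffset(100) == std::make_pair(0u, 100u));   // fits immoffset
  assert(splitOffset(4100) == std::make_pair(4096u, 4u)); // 4096 to voffset
  // Masking 0xFFFFF004 leaves a negative-looking 0xFFFFF000, so nothing
  // stays in the immediate.
  assert(splitOffset(0xFFFFF004u) == std::make_pair(0xFFFFF004u, 0u));
  return 0;
}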

/// Handle register layout difference for f16 images for some subtargets.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
MachineRegisterInfo &MRI,
@@ -2383,6 +2435,72 @@ bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
  return Ty == S32;
}

bool AMDGPULegalizerInfo::legalizeRawBufferLoad(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B,
                                                bool IsFormat) const {
  B.setInstr(MI);

  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize();
  const LLT S32 = LLT::scalar(32);

  Register Dst = MI.getOperand(0).getReg();
  Register RSrc = MI.getOperand(2).getReg();
  Register VOffset = MI.getOperand(3).getReg();
  Register SOffset = MI.getOperand(4).getReg();
  unsigned AuxiliaryData = MI.getOperand(5).getImm();
  unsigned ImmOffset;
  unsigned TotalOffset;

  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);

  unsigned Opc;
  switch (MemSize) {
  case 1:
    if (IsFormat)
      return false;
    Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
    break;
  case 2:
    if (IsFormat)
      return false;
    Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
    break;
  default:
    Opc = IsFormat ? -1/*TODO*/ : AMDGPU::G_AMDGPU_BUFFER_LOAD;
    break;
  }

  Register LoadDstReg = MemSize >= 4 ? Dst :
    B.getMRI()->createGenericVirtualRegister(S32);

  Register VIndex = B.buildConstant(S32, 0).getReg(0);

  B.buildInstr(Opc)
    .addDef(LoadDstReg)    // vdata
    .addUse(RSrc)          // rsrc
    .addUse(VIndex)        // vindex
    .addUse(VOffset)       // voffset
    .addUse(SOffset)       // soffset
    .addImm(ImmOffset)     // offset(imm)
    .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
    .addImm(0)             // idxen(imm)
    .addMemOperand(MMO);

  if (LoadDstReg != Dst) {
    // The result was widened for the extending load; truncate back down to
    // the original type.
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    B.buildTrunc(Dst, LoadDstReg);
  }

  MI.eraseFromParent();
  return true;
}
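
The trailing truncate exists because sub-dword buffer loads always define a
full 32-bit register. A standalone model of the widen-then-truncate flow
(illustrative C++, not the patch's code):

#include <cassert>
#include <cstdint>

// BUFFER_LOAD_UBYTE/USHORT zero-extend into a 32-bit result; the G_TRUNC
// the legalizer inserts then narrows back to the type the intrinsic
// call expected.
uint32_t widenedUByteLoad(uint32_t MemDword) { return MemDword & 0xFF; }
uint32_t widenedUShortLoad(uint32_t MemDword) { return MemDword & 0xFFFF; }

int main() {
  assert((uint8_t)widenedUByteLoad(0x12345678u) == 0x78);
  assert((uint16_t)widenedUShortLoad(0x12345678u) == 0x5678);
  return 0;
}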

bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
MachineIRBuilder &B,
bool IsInc) const {
@@ -2517,6 +2635,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
    return legalizeRawBufferStore(MI, MRI, B, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
    return legalizeRawBufferStore(MI, MRI, B, true);
  case Intrinsic::amdgcn_raw_buffer_load:
    return legalizeRawBufferLoad(MI, MRI, B, false);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
6 changes: 5 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -105,11 +105,15 @@ class AMDGPULegalizerInfo : public LegalizerInfo {
  bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI,
                           MachineIRBuilder &B, unsigned AddrSpace) const;

  std::tuple<Register, unsigned, unsigned>
  splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const;

  Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                          Register Reg) const;
  bool legalizeRawBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI,
                              MachineIRBuilder &B, bool IsFormat) const;

  bool legalizeRawBufferLoad(MachineInstr &MI, MachineRegisterInfo &MRI,
                             MachineIRBuilder &B, bool IsFormat) const;
  bool legalizeAtomicIncDec(MachineInstr &MI, MachineIRBuilder &B,
                            bool IsInc) const;

32 changes: 29 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2243,6 +2243,15 @@

    return;
  }
  case AMDGPU::G_AMDGPU_BUFFER_LOAD:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: {
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(MI, MRI, {1, 4});
    return;
  }
  case AMDGPU::G_INTRINSIC: {
    switch (MI.getIntrinsicID()) {
    case Intrinsic::amdgcn_s_buffer_load: {
@@ -2325,9 +2334,6 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
-    case Intrinsic::amdgcn_raw_buffer_load:
-    case Intrinsic::amdgcn_raw_buffer_load_format:
-    case Intrinsic::amdgcn_raw_tbuffer_load:
    case Intrinsic::amdgcn_raw_buffer_store:
    case Intrinsic::amdgcn_raw_buffer_store_format:
    case Intrinsic::amdgcn_raw_tbuffer_store: {
@@ -3061,6 +3067,26 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
    }
    break;
  }
  case AMDGPU::G_AMDGPU_BUFFER_LOAD:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

    // rsrc
    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // vindex
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // voffset
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

    // soffset
    OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_INTRINSIC: {
    switch (MI.getIntrinsicID()) {
    default:
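
The assignment reflects the MUBUF operand classes: the resource descriptor
and soffset are read from scalar registers, per-lane values go to VGPRs, and
operands 1 and 4 are exactly the ones the waterfall loop legalizes when they
turn out to be divergent. A compact restatement as an illustrative C++
sketch (not code from the patch):

#include <cassert>

enum class Bank { SGPR, VGPR };

// Register-bank assignment for the G_AMDGPU_BUFFER_LOAD* register
// operands, by operand index as used in the patch.
Bank bufferLoadOperandBank(unsigned OpIdx) {
  switch (OpIdx) {
  case 0: return Bank::VGPR;  // vdata result is per-lane
  case 1: return Bank::SGPR;  // rsrc descriptor must be uniform
  case 2: return Bank::VGPR;  // vindex
  case 3: return Bank::VGPR;  // voffset
  case 4: return Bank::SGPR;  // soffset must be uniform
  default: return Bank::SGPR; // operands past 4 are immediates
  }
}

int main() {
  // The waterfall loop covers the two operands that must be uniform.
  assert(bufferLoadOperandBank(1) == Bank::SGPR);
  assert(bufferLoadOperandBank(4) == Bank::SGPR);
  return 0;
}
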
8 changes: 4 additions & 4 deletions llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1152,19 +1152,19 @@ let SubtargetPredicate = isGFX10Plus in {
// MUBUF Patterns
//===----------------------------------------------------------------------===//

-def extract_glc : SDNodeXForm<imm, [{
+def extract_glc : SDNodeXForm<timm, [{
  return CurDAG->getTargetConstant(N->getZExtValue() & 1, SDLoc(N), MVT::i8);
}]>;

-def extract_slc : SDNodeXForm<imm, [{
+def extract_slc : SDNodeXForm<timm, [{
  return CurDAG->getTargetConstant((N->getZExtValue() >> 1) & 1, SDLoc(N), MVT::i8);
}]>;

-def extract_dlc : SDNodeXForm<imm, [{
+def extract_dlc : SDNodeXForm<timm, [{
  return CurDAG->getTargetConstant((N->getZExtValue() >> 2) & 1, SDLoc(N), MVT::i8);
}]>;

-def extract_swz : SDNodeXForm<imm, [{
+def extract_swz : SDNodeXForm<timm, [{
  return CurDAG->getTargetConstant((N->getZExtValue() >> 3) & 1, SDLoc(N), MVT::i8);
}]>;

15 changes: 15 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2146,6 +2146,21 @@ def G_AMDGPU_FFBH_U32 : AMDGPUGenericInstruction {
  let hasSideEffects = 0;
}

class BufferLoadGenericInstruction : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type1:$rsrc, type2:$vindex, type2:$voffset,
                           type2:$soffset, untyped_imm_0:$offset,
                           untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
  let hasSideEffects = 0;
  let mayLoad = 1;
}

def G_AMDGPU_BUFFER_LOAD_UBYTE : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_SBYTE : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_USHORT : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_SSHORT : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD : BufferLoadGenericInstruction;

// Atomic cmpxchg. $cmpval and $newval are packed in a single vector
// operand. Expects a MachineMemOperand in addition to explicit
// operands.