[AMDGPU] CodeGen for SMEM instructions (#75579)
mbrkusanin committed Dec 15, 2023
1 parent 70579c9 commit a278ac5
Showing 30 changed files with 10,826 additions and 667 deletions.
12 changes: 12 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -828,6 +828,12 @@ def FeaturePseudoScalarTrans : SubtargetFeature<"pseudo-scalar-trans",
"Has Pseudo Scalar Transcendental instructions"
>;

def FeatureHasRestrictedSOffset : SubtargetFeature<"restricted-soffset",
"HasRestrictedSOffset",
"true",
"Has restricted SOffset (immediate not supported)."
>;

//===------------------------------------------------------------===//
// Subtarget Features (options and debugging)
//===------------------------------------------------------------===//
@@ -1474,6 +1480,7 @@ def FeatureISAVersion12 : FeatureSet<
FeatureVcmpxPermlaneHazard,
FeatureSALUFloatInsts,
FeaturePseudoScalarTrans,
FeatureHasRestrictedSOffset,
FeatureVGPRSingleUseHintInsts,
FeatureMADIntraFwdBug,
FeatureScalarDwordx3Loads]>;
@@ -1787,6 +1794,11 @@ def HasUnpackedD16VMem : Predicate<"Subtarget->hasUnpackedD16VMem()">,
def HasPackedD16VMem : Predicate<"!Subtarget->hasUnpackedD16VMem()">,
AssemblerPredicate<(all_of (not FeatureUnpackedD16VMem))>;

def HasRestrictedSOffset : Predicate<"Subtarget->hasRestrictedSOffset()">,
AssemblerPredicate<(all_of FeatureHasRestrictedSOffset)>;
def HasUnrestrictedSOffset : Predicate<"!Subtarget->hasRestrictedSOffset()">,
AssemblerPredicate<(all_of (not FeatureHasRestrictedSOffset))>;

def D16PreservesUnusedBits :
Predicate<"Subtarget->d16PreservesUnusedBits()">,
AssemblerPredicate<(all_of FeatureGFX9Insts, (not FeatureSRAMECC))>;
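The new feature bit is consumed in two ways below: C++ lowering code queries Subtarget->hasRestrictedSOffset(), while the HasRestrictedSOffset/HasUnrestrictedSOffset predicate pair gates instruction patterns and the assembler. As a standalone sketch of the decision this bit drives, in plain C++ rather than LLVM API (function and return strings are illustrative, not part of the patch):

#include <cstdint>
#include <string>

// Which form the soffset operand takes once immediates are restricted.
// "null" stands for SGPR_NULL, which reads as zero on hardware that has it.
std::string soffsetOperandFor(int64_t Value, bool IsConstant,
                              bool HasRestrictedSOffset) {
  if (!IsConstant)
    return "sgpr";                         // already a register, always legal
  if (!HasRestrictedSOffset)
    return "imm:" + std::to_string(Value); // immediate offsets are allowed
  if (Value == 0)
    return "null";                         // fold zero to SGPR_NULL
  return "sgpr (via s_mov_b32)";           // must be materialized in an SGPR
}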
8 changes: 4 additions & 4 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -460,8 +460,8 @@ static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
return false;

// If we have 96-bit memory operations, we shouldn't touch them. Note we may
- // end up widening these for a scalar load during RegBankSelect, since there
- // aren't 96-bit scalar loads.
+ // end up widening these for a scalar load during RegBankSelect, if we don't
+ // have 96-bit scalar loads.
if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
return false;

@@ -6467,10 +6467,10 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(
MemSize, MemAlign);
MI.addMemOperand(MF, MMO);

- // There are no 96-bit result scalar loads, but widening to 128-bit should
+ // If we don't have 96-bit result scalar loads, widening to 128-bit should
// always be legal. We may need to restore this to a 96-bit result if it turns
// out this needs to be converted to a vector load during RegBankSelect.
- if (!isPowerOf2_32(Size)) {
+ if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
if (Ty.isVector())
Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
else
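Distilled, the rule both hunks above implement is: widen a non-power-of-two scalar load to the next power of two, unless it is a 96-bit load on a subtarget with native s_load_dwordx3. A minimal standalone sketch (assumes the same semantics as isPowerOf2_32; this is not the LLVM helper itself):

bool shouldWidenScalarLoad(unsigned SizeInBits, bool HasScalarDwordx3Loads) {
  bool IsPow2 = SizeInBits != 0 && (SizeInBits & (SizeInBits - 1)) == 0;
  if (IsPow2)
    return false;          // power-of-two loads are already fine
  if (SizeInBits == 96 && HasScalarDwordx3Loads)
    return false;          // keep the dwordx3 load as-is
  return true;             // e.g. widen 96 -> 128 without dwordx3
}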
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -1061,7 +1061,7 @@ bool AMDGPURegisterBankInfo::applyMappingLoad(
if (DstBank == &AMDGPU::SGPRRegBank) {
// There are some special cases that we need to look at for 32 bit and 96
// bit SGPR loads otherwise we have nothing to do.
- if (LoadSize != 32 && LoadSize != 96)
+ if (LoadSize != 32 && (LoadSize != 96 || Subtarget.hasScalarDwordx3Loads()))
return false;

MachineMemOperand *MMO = *MI.memoperands_begin();
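The register-bank fixup above is the flip side of the same carve-out: 96-bit SGPR loads only need repair here when the subtarget lacks native s_load_dwordx3, so the early return-false (nothing-to-do) path now also covers dwordx3-capable targets.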
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -199,6 +199,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool HasSALUFloatInsts = false;
bool HasVGPRSingleUseHintInsts = false;
bool HasPseudoScalarTrans = false;
bool HasRestrictedSOffset = false;

bool HasVcmpxPermlaneHazard = false;
bool HasVMEMtoScalarWriteHazard = false;
@@ -1163,6 +1164,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,

bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; }

bool hasRestrictedSOffset() const { return HasRestrictedSOffset; }

/// Return the maximum number of waves per SIMD for kernels using \p SGPRs
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
90 changes: 64 additions & 26 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1438,11 +1438,15 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
// On VI, these use the SMEM format and the offset is 20-bit in bytes.
if (!isUInt<20>(AM.BaseOffs))
return false;
- } else {
+ } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
// On GFX9 the offset is signed 21-bit in bytes (but must not be negative
// for S_BUFFER_* instructions).
if (!isInt<21>(AM.BaseOffs))
return false;
+ } else {
+ // On GFX12, all offsets are signed 24-bit in bytes.
+ if (!isInt<24>(AM.BaseOffs))
+ return false;
}

if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
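The three ranges above, condensed into a standalone checker. Generation buckets are simplified, the isUInt/isInt templates are expanded inline, and the S_BUFFER_* no-negative-offset caveat is omitted; names are illustrative:

#include <cstdint>

enum class SMEMGen { VI, GFX9ToGFX11, GFX12 };

// Legality of an immediate byte offset in SMEM addressing, per generation:
// VI: unsigned 20-bit; GFX9..GFX11: signed 21-bit; GFX12: signed 24-bit.
bool isLegalSMEMImmOffset(SMEMGen Gen, int64_t Offs) {
  switch (Gen) {
  case SMEMGen::VI:
    return Offs >= 0 && Offs < (INT64_C(1) << 20);                   // isUInt<20>
  case SMEMGen::GFX9ToGFX11:
    return Offs >= -(INT64_C(1) << 20) && Offs < (INT64_C(1) << 20); // isInt<21>
  case SMEMGen::GFX12:
    return Offs >= -(INT64_C(1) << 23) && Offs < (INT64_C(1) << 23); // isInt<24>
  }
  return false;
}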
@@ -7497,7 +7501,8 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
};

// Widen vec3 load to vec4.
- if (VT.isVector() && VT.getVectorNumElements() == 3) {
+ if (VT.isVector() && VT.getVectorNumElements() == 3 &&
+ !Subtarget->hasScalarDwordx3Loads()) {
EVT WidenedVT =
EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
auto WidenedOp = DAG.getMemIntrinsicNode(
@@ -7913,6 +7918,19 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
}

// On targets that do not support a constant in the soffset field, turn a
// zero into SGPR_NULL to avoid generating an extra s_mov with zero.
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
const GCNSubtarget *Subtarget) {
if (Subtarget->hasRestrictedSOffset())
if (auto SOffsetConst = dyn_cast<ConstantSDNode>(SOffset)) {
if (SOffsetConst->isZero()) {
return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
}
}
return SOffset;
}
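Every buffer-intrinsic lowering below routes its soffset operand through this helper, so a literal zero reaches instruction selection as SGPR_NULL (which reads as zero) rather than as a constant that the restricted targets would otherwise have to materialize with s_mov_b32.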

SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
SelectionDAG &DAG,
unsigned NewOpcode) const {
@@ -7921,13 +7939,14 @@ SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
SDValue VData = Op.getOperand(2);
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
VData, // vdata
Rsrc, // rsrc
DAG.getConstant(0, DL, MVT::i32), // vindex
Offsets.first, // voffset
- Op.getOperand(5), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(6), // cachepolicy
DAG.getTargetConstant(0, DL, MVT::i1), // idxen
@@ -7954,13 +7973,14 @@ SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
SDValue VData = Op.getOperand(2);
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
VData, // vdata
Rsrc, // rsrc
Op.getOperand(4), // vindex
Offsets.first, // voffset
- Op.getOperand(6), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(7), // cachepolicy
DAG.getTargetConstant(1, DL, MVT::i1), // idxen
@@ -8116,12 +8136,13 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,

SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
Rsrc, // rsrc
DAG.getConstant(0, DL, MVT::i32), // vindex
Offsets.first, // voffset
- Op.getOperand(4), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(5), // cachepolicy, swizzled buffer
DAG.getTargetConstant(0, DL, MVT::i1), // idxen
@@ -8140,12 +8161,13 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,

SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
Rsrc, // rsrc
Op.getOperand(3), // vindex
Offsets.first, // voffset
- Op.getOperand(5), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(6), // cachepolicy, swizzled buffer
DAG.getTargetConstant(1, DL, MVT::i1), // idxen
@@ -8157,21 +8179,22 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
MemSDNode *M = cast<MemSDNode>(Op);
EVT LoadVT = Op.getValueType();

auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
unsigned Glc = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
unsigned Slc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
unsigned IdxEn = getIdxEn(Op.getOperand(3));
SDValue Ops[] = {
- Op.getOperand(0), // Chain
- Op.getOperand(2), // rsrc
- Op.getOperand(3), // vindex
- Op.getOperand(4), // voffset
- Op.getOperand(5), // soffset
- Op.getOperand(6), // offset
- DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
- DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
- DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen
+ Op.getOperand(0),                                        // Chain
+ Op.getOperand(2),                                        // rsrc
+ Op.getOperand(3),                                        // vindex
+ Op.getOperand(4),                                        // voffset
+ SOffset,                                                 // soffset
+ Op.getOperand(6),                                        // offset
+ DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
+ DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32),   // cachepolicy
+ DAG.getTargetConstant(IdxEn, DL, MVT::i1)                // idxen
};

if (LoadVT.getScalarType() == MVT::f16)
@@ -8187,13 +8210,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
EVT LoadVT = Op.getValueType();
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);

SDValue Ops[] = {
Op.getOperand(0), // Chain
Rsrc, // rsrc
DAG.getConstant(0, DL, MVT::i32), // vindex
Offsets.first, // voffset
- Op.getOperand(4), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(5), // format
Op.getOperand(6), // cachepolicy, swizzled buffer
@@ -8213,13 +8237,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
EVT LoadVT = Op.getValueType();
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);

SDValue Ops[] = {
Op.getOperand(0), // Chain
Rsrc, // rsrc
Op.getOperand(3), // vindex
Offsets.first, // voffset
- Op.getOperand(5), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(6), // format
Op.getOperand(7), // cachepolicy, swizzled buffer
@@ -8432,14 +8457,15 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
Op.getOperand(2), // src
Op.getOperand(3), // cmp
Rsrc, // rsrc
DAG.getConstant(0, DL, MVT::i32), // vindex
Offsets.first, // voffset
- Op.getOperand(6), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(7), // cachepolicy
DAG.getTargetConstant(0, DL, MVT::i1), // idxen
@@ -8454,14 +8480,15 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
Op.getOperand(2), // src
Op.getOperand(3), // cmp
Rsrc, // rsrc
Op.getOperand(5), // vindex
Offsets.first, // voffset
- Op.getOperand(7), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(8), // cachepolicy
DAG.getTargetConstant(1, DL, MVT::i1), // idxen
@@ -8893,13 +8920,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
VData = handleD16VData(VData, DAG);
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
SDValue Ops[] = {
Chain,
VData, // vdata
Rsrc, // rsrc
Op.getOperand(4), // vindex
Offsets.first, // voffset
- Op.getOperand(6), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(7), // format
Op.getOperand(8), // cachepolicy, swizzled buffer
@@ -8920,13 +8948,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
VData = handleD16VData(VData, DAG);
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
SDValue Ops[] = {
Chain,
VData, // vdata
Rsrc, // rsrc
DAG.getConstant(0, DL, MVT::i32), // vindex
Offsets.first, // voffset
- Op.getOperand(5), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(6), // format
Op.getOperand(7), // cachepolicy, swizzled buffer
@@ -9000,13 +9029,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,

SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
SDValue Ops[] = {
Chain,
VData,
Rsrc,
DAG.getConstant(0, DL, MVT::i32), // vindex
Offsets.first, // voffset
- Op.getOperand(5), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(6), // cachepolicy, swizzled buffer
DAG.getTargetConstant(0, DL, MVT::i1), // idxen
@@ -9050,13 +9080,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,

auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
SDValue Ops[] = {
Chain,
VData,
Rsrc,
Op.getOperand(4), // vindex
Offsets.first, // voffset
- Op.getOperand(6), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(7), // cachepolicy, swizzled buffer
DAG.getTargetConstant(1, DL, MVT::i1), // idxen
@@ -9404,8 +9435,13 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
return;
}
}

SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
: DAG.getConstant(0, DL, MVT::i32);

Offsets[0] = CombinedOffset;
- Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
+ Offsets[1] = SOffsetZero;
Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
}
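This is the same zero-soffset fold as selectSOffset above, applied at the point where setBufferOffsets falls back to a zero soffset rather than a value split out of the combined offset.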

@@ -9663,7 +9699,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) {
- if (MemVT.isPow2VectorType())
+ if (MemVT.isPow2VectorType() ||
+ (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
return SDValue();
return WidenOrSplitVectorLoad(Op, DAG);
}
@@ -9679,7 +9716,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
Load->isSimple() && isMemOpHasNoClobberedMemOperand(Load) &&
Alignment >= Align(4) && NumElements < 32) {
- if (MemVT.isPow2VectorType())
+ if (MemVT.isPow2VectorType() ||
+ (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
return SDValue();
return WidenOrSplitVectorLoad(Op, DAG);
}
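Both uniform-load paths get the same exception: a 3-element vector type is not a power of two, but it no longer forces WidenOrSplitVectorLoad when the subtarget provides s_load_dwordx3.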
