Skip to content

Commit

Permalink
AMDGPU: Add offsets to MMO when lowering buffer intrinsics
Browse files Browse the repository at this point in the history
Summary:
Without offsets on the MachineMemOperands (MMOs),
MachineInstr::mayAlias() will return true for all reads and writes to the
same resource descriptor.  This leads to O(N^2) complexity in the MachineScheduler
when analyzing dependencies of buffer loads and stores.  It also limits
the SILoadStoreOptimizer from merging more instructions.

This patch reduces the compile time of one pathological compute shader
from 12 seconds to 1 second.

Reviewers: arsenm, nhaehnle

Reviewed By: arsenm

Subscribers: kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, jfb, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D65097

llvm-svn: 374087
  • Loading branch information
tstellar committed Oct 8, 2019
1 parent fb8218f commit 3a8d809
Show file tree
Hide file tree
Showing 3 changed files with 487 additions and 11 deletions.
78 changes: 69 additions & 9 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6092,6 +6092,30 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
}

// This function computes an appropriate offset to pass to
// MachineMemOperand::setOffset() based on the offset inputs to
// an intrinsic. If any of the offsets are non-contstant or
// if VIndex is non-zero then this function returns 0. Otherwise,
// it returns the sum of VOffset, SOffset, and Offset.
static unsigned getBufferOffsetForMMO(SDValue VOffset,
SDValue SOffset,
SDValue Offset,
SDValue VIndex = SDValue()) {

if (!isa<ConstantSDNode>(VOffset) || !isa<ConstantSDNode>(SOffset) ||
!isa<ConstantSDNode>(Offset))
return 0;

if (VIndex) {
if (!isa<ConstantSDNode>(VIndex) || !cast<ConstantSDNode>(VIndex)->isNullValue())
return 0;
}

return cast<ConstantSDNode>(VOffset)->getSExtValue() +
cast<ConstantSDNode>(SOffset)->getSExtValue() +
cast<ConstantSDNode>(Offset)->getSExtValue();
}

SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
Expand Down Expand Up @@ -6238,13 +6262,18 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
};

setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
unsigned Offset = setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
// We don't know the offset if vindex is non-zero, so clear it.
if (IdxEn)
Offset = 0;

unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;

EVT VT = Op.getValueType();
EVT IntVT = VT.changeTypeToInteger();
auto *M = cast<MemSDNode>(Op);
M->getMemOperand()->setOffset(Offset);
EVT LoadVT = Op.getValueType();

if (LoadVT.getScalarType() == MVT::f16)
Expand Down Expand Up @@ -6275,7 +6304,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
DAG.getTargetConstant(0, DL, MVT::i1), // idxen
};

return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
auto *M = cast<MemSDNode>(Op);
M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[3], Ops[4], Ops[5]));
return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
}
case Intrinsic::amdgcn_struct_buffer_load:
case Intrinsic::amdgcn_struct_buffer_load_format: {
Expand All @@ -6293,6 +6324,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
DAG.getTargetConstant(1, DL, MVT::i1), // idxen
};

auto *M = cast<MemSDNode>(Op);
M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[3], Ops[4], Ops[5],
Ops[2]));
return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
}
case Intrinsic::amdgcn_tbuffer_load: {
Expand Down Expand Up @@ -6398,10 +6432,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
};
setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
// We don't know the offset if vindex is non-zero, so clear it.
if (IdxEn)
Offset = 0;
EVT VT = Op.getValueType();

auto *M = cast<MemSDNode>(Op);
M->getMemOperand()->setOffset(Offset);
unsigned Opcode = 0;

switch (IntrID) {
Expand Down Expand Up @@ -6469,6 +6507,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
EVT VT = Op.getValueType();

auto *M = cast<MemSDNode>(Op);
M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6]));
unsigned Opcode = 0;

switch (IntrID) {
Expand Down Expand Up @@ -6542,6 +6581,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
EVT VT = Op.getValueType();

auto *M = cast<MemSDNode>(Op);
M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6],
Ops[3]));
unsigned Opcode = 0;

switch (IntrID) {
Expand Down Expand Up @@ -6605,9 +6646,13 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
};
setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
unsigned Offset = setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
// We don't know the offset if vindex is non-zero, so clear it.
if (IdxEn)
Offset = 0;
EVT VT = Op.getValueType();
auto *M = cast<MemSDNode>(Op);
M->getMemOperand()->setOffset(Offset);

return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
Op->getVTList(), Ops, VT, M->getMemOperand());
Expand All @@ -6628,6 +6673,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
};
EVT VT = Op.getValueType();
auto *M = cast<MemSDNode>(Op);
M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[5], Ops[6], Ops[7]));

return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
Op->getVTList(), Ops, VT, M->getMemOperand());
Expand All @@ -6648,6 +6694,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
};
EVT VT = Op.getValueType();
auto *M = cast<MemSDNode>(Op);
M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[5], Ops[6], Ops[7],
Ops[4]));

return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
Op->getVTList(), Ops, VT, M->getMemOperand());
Expand Down Expand Up @@ -6889,11 +6937,15 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
};
setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
// We don't know the offset if vindex is non-zero, so clear it.
if (IdxEn)
Offset = 0;
unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
MemSDNode *M = cast<MemSDNode>(Op);
M->getMemOperand()->setOffset(Offset);

// Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
EVT VDataType = VData.getValueType().getScalarType();
Expand Down Expand Up @@ -6938,6 +6990,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
MemSDNode *M = cast<MemSDNode>(Op);
M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6]));

// Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
Expand Down Expand Up @@ -6982,6 +7035,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
MemSDNode *M = cast<MemSDNode>(Op);
M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6],
Ops[3]));

// Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
EVT VDataType = VData.getValueType().getScalarType();
Expand All @@ -7008,10 +7063,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
};
setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
// We don't know the offset if vindex is non-zero, so clear it.
if (IdxEn)
Offset = 0;
EVT VT = Op.getOperand(2).getValueType();

auto *M = cast<MemSDNode>(Op);
M->getMemOperand()->setOffset(Offset);
unsigned Opcode = VT.isVector() ? AMDGPUISD::BUFFER_ATOMIC_PK_FADD
: AMDGPUISD::BUFFER_ATOMIC_FADD;

Expand Down Expand Up @@ -7105,7 +7164,7 @@ std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
// pointed to by Offsets.
void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
unsigned SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
SelectionDAG &DAG, SDValue *Offsets,
unsigned Align) const {
SDLoc DL(CombinedOffset);
Expand All @@ -7116,7 +7175,7 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
return;
return SOffset + ImmOffset;
}
}
if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
Expand All @@ -7129,12 +7188,13 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
Offsets[0] = N0;
Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
return;
return 0;
}
}
Offsets[0] = CombinedOffset;
Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
return 0;
}

// Handle 8 bit and 16 bit buffer loads
Expand Down
6 changes: 4 additions & 2 deletions llvm/lib/Target/AMDGPU/SIISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -203,8 +203,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
// pointed to by Offsets.
void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
SDValue *Offsets, unsigned Align = 4) const;
/// \returns 0 If there is a non-constant offset or if the offset is 0.
/// Otherwise returns the constant offset.
unsigned setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
SDValue *Offsets, unsigned Align = 4) const;

// Handle 8 bit and 16 bit buffer loads
SDValue handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT, SDLoc DL,
Expand Down
Loading

0 comments on commit 3a8d809

Please sign in to comment.