[AMDGPU] CodeGen for GFX12 VFLAT, VSCRATCH and VGLOBAL instructions (#…
mbrkusanin committed Dec 15, 2023
1 parent 214d32c commit 07a6d73
Showing 57 changed files with 13,866 additions and 373 deletions.
5 changes: 5 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2486,6 +2486,11 @@ def int_amdgcn_permlanex16_var : ClangBuiltin<"__builtin_amdgcn_permlanex16_var"
[IntrNoMem, IntrConvergent, IntrWillReturn,
ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_flat_atomic_fmin_num : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fmax_num : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_global_atomic_fmin_num : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_global_atomic_fmax_num : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;

//===----------------------------------------------------------------------===//
// Deep learning intrinsics.
//===----------------------------------------------------------------------===//
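The new _num flat/global fmin/fmax intrinsics reuse the AMDGPUGlobalAtomicRtn shape (result, pointer operand, data operand). Below is a minimal sketch of emitting one of them through IRBuilder; the overload-type list and the mangled name in the comment are assumptions inferred from AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty> and should be checked against the generated intrinsic records before relying on them.

// Sketch only: emit a call that would mangle to something like
// llvm.amdgcn.flat.atomic.fmin.num.f32.p0.f32 (assumed name/overloads).
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Module.h"

using namespace llvm;

static Value *emitFlatAtomicFMinNum(IRBuilder<> &B, Module &M, Value *Ptr,
                                    Value *Val) {
  Type *F32 = B.getFloatTy();
  // Flat (generic) pointer, address space 0.
  Type *FlatPtrTy = PointerType::get(B.getContext(), /*AddressSpace=*/0);
  // Assumed overload order: {result type, pointer type, data type}.
  Function *Fn = Intrinsic::getDeclaration(
      &M, Intrinsic::amdgcn_flat_atomic_fmin_num, {F32, FlatPtrTy, F32});
  return B.CreateCall(Fn, {Ptr, Val});
}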
13 changes: 9 additions & 4 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1166,6 +1166,11 @@ bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
if (isNoUnsignedWrap(Addr))
return true;

// Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
// values.
if (AMDGPU::isGFX12Plus(*Subtarget))
return true;

auto LHS = Addr.getOperand(0);
auto RHS = Addr.getOperand(1);

@@ -1701,7 +1706,7 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
}

VAddr = Addr;
Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i16);
Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
return true;
}

@@ -1769,7 +1774,7 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
VOffset = SDValue(VMov, 0);
SAddr = LHS;
Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
return true;
}
}
@@ -1809,7 +1814,7 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
}

if (SAddr) {
Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
return true;
}
}
@@ -1825,7 +1830,7 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
VOffset = SDValue(VMov, 0);
Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
return true;
}

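The MVT::i16 to MVT::i32 changes in SelectFlatOffsetImpl and SelectGlobalSAddr above track the wider immediate offset field on GFX12 flat, global and scratch instructions; with what appears to be a 24-bit signed field (see the getNumFlatOffsetBits uses in the SIInstrInfo.cpp changes below), the largest legal offsets no longer fit a 16-bit constant. A minimal standalone sketch of the ranges involved, with the 24-bit width taken as an assumption:

// Sketch: why a 24-bit signed flat offset (assumed GFX12 width) needs an
// i32 target constant rather than i16.
#include <cstdint>
#include <cstdio>

int main() {
  const unsigned OffsetBits = 24;                           // assumed GFX12 width
  const int64_t MaxOffset = (1ll << (OffsetBits - 1)) - 1;  //  8388607
  const int64_t MinOffset = -(1ll << (OffsetBits - 1));     // -8388608
  std::printf("offset range [%lld, %lld], INT16_MAX = %d\n",
              (long long)MinOffset, (long long)MaxOffset, INT16_MAX);
  return 0;
}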
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -642,6 +642,10 @@ defm int_amdgcn_global_atomic_fmax : noret_op;
defm int_amdgcn_global_atomic_csub : noret_op;
defm int_amdgcn_flat_atomic_fadd : local_addr_space_atomic_op;
defm int_amdgcn_ds_fadd_v2bf16 : noret_op;
defm int_amdgcn_flat_atomic_fmin_num : noret_op;
defm int_amdgcn_flat_atomic_fmax_num : noret_op;
defm int_amdgcn_global_atomic_fmin_num : noret_op;
defm int_amdgcn_global_atomic_fmax_num : noret_op;

multiclass noret_binary_atomic_op<SDNode atomic_op, bit IsInt = 1> {
let HasNoUse = true in
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4663,9 +4663,13 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_global_atomic_csub:
case Intrinsic::amdgcn_global_atomic_fmin:
case Intrinsic::amdgcn_global_atomic_fmax:
case Intrinsic::amdgcn_global_atomic_fmin_num:
case Intrinsic::amdgcn_global_atomic_fmax_num:
case Intrinsic::amdgcn_flat_atomic_fadd:
case Intrinsic::amdgcn_flat_atomic_fmin:
case Intrinsic::amdgcn_flat_atomic_fmax:
case Intrinsic::amdgcn_flat_atomic_fmin_num:
case Intrinsic::amdgcn_flat_atomic_fmax_num:
case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
return getDefaultMappingAllVGPR(MI);
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -241,9 +241,13 @@ def : SourceOfDivergence<int_amdgcn_global_atomic_csub>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fadd>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fmin_num>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fmax_num>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin_num>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax_num>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_ds_fadd>;
6 changes: 5 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1026,6 +1026,8 @@ bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
case Intrinsic::amdgcn_flat_atomic_fadd:
case Intrinsic::amdgcn_flat_atomic_fmax:
case Intrinsic::amdgcn_flat_atomic_fmin:
case Intrinsic::amdgcn_flat_atomic_fmax_num:
case Intrinsic::amdgcn_flat_atomic_fmin_num:
OpIndexes.push_back(0);
return true;
default:
@@ -1100,7 +1102,9 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
}
case Intrinsic::amdgcn_flat_atomic_fadd:
case Intrinsic::amdgcn_flat_atomic_fmax:
case Intrinsic::amdgcn_flat_atomic_fmin: {
case Intrinsic::amdgcn_flat_atomic_fmin:
case Intrinsic::amdgcn_flat_atomic_fmax_num:
case Intrinsic::amdgcn_flat_atomic_fmin_num: {
Type *DestTy = II->getType();
Type *SrcTy = NewV->getType();
unsigned NewAS = SrcTy->getPointerAddressSpace();
57 changes: 46 additions & 11 deletions llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1072,19 +1072,43 @@ class FlatStoreSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node,
(inst $vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)
>;

multiclass FlatAtomicPat <string inst, string node, ValueType vt,
ValueType data_vt = vt> {
defvar rtnNode = !cast<PatFrags>(node#"_"#vt.Size);
defvar noRtnNode = !cast<PatFrags>(node#"_noret_"#vt.Size);

def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)),
(!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
multiclass FlatAtomicNoRtnPat <string inst, string node, ValueType vt,
ValueType data_vt = vt, bit isIntr = 0> {
defvar noRtnNode = !cast<PatFrags>(node # "_noret" # !if(isIntr, "", "_"#vt.Size));

let AddedComplexity = 1 in
def : GCNPat <(vt (noRtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)),
(!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
}

multiclass FlatAtomicRtnPat <string inst, string node, ValueType vt,
ValueType data_vt = vt, bit isIntr = 0> {
defvar rtnNode = !cast<SDPatternOperator>(node # !if(isIntr, "", "_"#vt.Size));

def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)),
(!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
}

multiclass FlatAtomicPat <string inst, string node, ValueType vt,
ValueType data_vt = vt, bit isIntr = 0> :
FlatAtomicRtnPat<inst, node, vt, data_vt, isIntr>,
FlatAtomicNoRtnPat<inst, node, vt, data_vt, isIntr>;

multiclass FlatAtomicIntrNoRtnPat <string inst, string node, ValueType vt,
ValueType data_vt = vt> {
defm : FlatAtomicNoRtnPat<inst, node, vt, data_vt, /* isIntr */ 1>;
}

multiclass FlatAtomicIntrRtnPat <string inst, string node, ValueType vt,
ValueType data_vt = vt> {
defm : FlatAtomicRtnPat<inst, node, vt, data_vt, /* isIntr */ 1>;
}

multiclass FlatAtomicIntrPat <string inst, string node, ValueType vt,
ValueType data_vt = vt> :
FlatAtomicRtnPat<inst, node, vt, data_vt, /* isIntr */ 1>,
FlatAtomicNoRtnPat<inst, node, vt, data_vt, /* isIntr */ 1>;

class FlatSignedAtomicPatBase <FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt, ValueType data_vt = vt> : GCNPat <
(vt (node (GlobalOffset i64:$vaddr, i32:$offset), data_vt:$data)),
@@ -1305,10 +1329,10 @@ multiclass GlobalFLATStorePats<FLAT_Pseudo inst, SDPatternOperator node,
multiclass GlobalFLATAtomicPatsNoRtnBase<string inst, string node, ValueType vt,
ValueType data_vt = vt> {
let AddedComplexity = 11 in
def : FlatSignedAtomicPatBase<!cast<FLAT_Pseudo>(inst), !cast<PatFrags>(node), vt, data_vt>;
def : FlatSignedAtomicPatBase<!cast<FLAT_Pseudo>(inst), !cast<SDPatternOperator>(node), vt, data_vt>;

let AddedComplexity = 13 in
def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR"), !cast<PatFrags>(node), vt, data_vt>;
def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR"), !cast<SDPatternOperator>(node), vt, data_vt>;
}

multiclass GlobalFLATAtomicPatsRtnBase<string inst, string node, ValueType vt,
@@ -1508,10 +1532,14 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i
let OtherPredicates = [isGFX10Plus] in {
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>;
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin", f32>;
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax", f32>;
defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMIN", "atomic_load_fmin_flat", f32>;
defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMAX", "atomic_load_fmax_flat", f32>;
}

let OtherPredicates = [isGFX10GFX11] in {
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin", f32>;
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax", f32>;

defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin", f32>;
defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax", f32>;
}
@@ -1527,6 +1555,13 @@ defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMIN_X2", "int_amdgcn_flat_atomic_f
defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX_X2", "int_amdgcn_flat_atomic_fmax", f64>;
}

let OtherPredicates = [isGFX12Only] in {
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin_num", f32>;
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax_num", f32>;
defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin_num", f32>;
defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax_num", f32>;
}

let OtherPredicates = [HasAtomicFaddNoRtnInsts] in {
defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>;
defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f32>;
20 changes: 17 additions & 3 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1230,9 +1230,13 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::amdgcn_global_atomic_fadd:
case Intrinsic::amdgcn_global_atomic_fmin:
case Intrinsic::amdgcn_global_atomic_fmax:
case Intrinsic::amdgcn_global_atomic_fmin_num:
case Intrinsic::amdgcn_global_atomic_fmax_num:
case Intrinsic::amdgcn_flat_atomic_fadd:
case Intrinsic::amdgcn_flat_atomic_fmin:
case Intrinsic::amdgcn_flat_atomic_fmax:
case Intrinsic::amdgcn_flat_atomic_fmin_num:
case Intrinsic::amdgcn_flat_atomic_fmax_num:
case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
@@ -1315,6 +1319,8 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
case Intrinsic::amdgcn_flat_atomic_fadd:
case Intrinsic::amdgcn_flat_atomic_fmin:
case Intrinsic::amdgcn_flat_atomic_fmax:
case Intrinsic::amdgcn_flat_atomic_fmin_num:
case Intrinsic::amdgcn_flat_atomic_fmax_num:
case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
case Intrinsic::amdgcn_global_atomic_csub: {
@@ -8642,8 +8648,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
}
case Intrinsic::amdgcn_global_atomic_fmin:
case Intrinsic::amdgcn_global_atomic_fmax:
case Intrinsic::amdgcn_global_atomic_fmin_num:
case Intrinsic::amdgcn_global_atomic_fmax_num:
case Intrinsic::amdgcn_flat_atomic_fmin:
case Intrinsic::amdgcn_flat_atomic_fmax: {
case Intrinsic::amdgcn_flat_atomic_fmax:
case Intrinsic::amdgcn_flat_atomic_fmin_num:
case Intrinsic::amdgcn_flat_atomic_fmax_num: {
MemSDNode *M = cast<MemSDNode>(Op);
SDValue Ops[] = {
M->getOperand(0), // Chain
@@ -8653,12 +8663,16 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
unsigned Opcode = 0;
switch (IntrID) {
case Intrinsic::amdgcn_global_atomic_fmin:
case Intrinsic::amdgcn_flat_atomic_fmin: {
case Intrinsic::amdgcn_global_atomic_fmin_num:
case Intrinsic::amdgcn_flat_atomic_fmin:
case Intrinsic::amdgcn_flat_atomic_fmin_num: {
Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN;
break;
}
case Intrinsic::amdgcn_global_atomic_fmax:
case Intrinsic::amdgcn_flat_atomic_fmax: {
case Intrinsic::amdgcn_global_atomic_fmax_num:
case Intrinsic::amdgcn_flat_atomic_fmax:
case Intrinsic::amdgcn_flat_atomic_fmax_num: {
Opcode = AMDGPUISD::ATOMIC_LOAD_FMAX;
break;
}
19 changes: 11 additions & 8 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -8689,16 +8689,13 @@ bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
return false;

bool AllowNegative = FlatVariant != SIInstrFlags::FLAT;
if (ST.hasNegativeScratchOffsetBug() &&
FlatVariant == SIInstrFlags::FlatScratch)
AllowNegative = false;
if (ST.hasNegativeUnalignedScratchOffsetBug() &&
FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
(Offset % 4) != 0) {
return false;
}

bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
}
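Together with the allowNegativeFlatOffset helper introduced below, the legality check above boils down to three questions: does the offset fit the subtarget's signed offset width, is a negative value allowed for this flat variant (always on GFX12+, never for plain FLAT before GFX12, and never for scratch on subtargets with the negative-scratch-offset bug), and does a negative unaligned scratch offset hit the known hardware bug. A self-contained sketch of that decision, with the bit width passed in rather than queried from the subtarget:

// Standalone model of the isLegalFLATOffset decision above (a sketch; the
// real code queries GCNSubtarget for the bit width and bug workarounds, and
// also rejects address spaces a given flat variant cannot address).
#include <cstdint>
#include <cstdio>

enum class FlatVariant { Flat, FlatGlobal, FlatScratch };

static bool fitsSignedBits(unsigned N, int64_t X) {
  return X >= -(1ll << (N - 1)) && X < (1ll << (N - 1));
}

static bool isLegalFlatOffset(int64_t Offset, FlatVariant Variant,
                              unsigned OffsetBits, bool IsGFX12Plus,
                              bool NegScratchBug, bool NegUnalignedScratchBug) {
  // Negative unaligned scratch offsets trip a hardware bug on some subtargets.
  if (NegUnalignedScratchBug && Variant == FlatVariant::FlatScratch &&
      Offset < 0 && (Offset % 4) != 0)
    return false;

  // allowNegativeFlatOffset(): scratch never allows negative offsets on
  // subtargets with the negative-scratch-offset bug; otherwise any non-FLAT
  // variant allows them, and GFX12+ allows them for plain FLAT as well.
  bool AllowNegative =
      !(NegScratchBug && Variant == FlatVariant::FlatScratch) &&
      (Variant != FlatVariant::Flat || IsGFX12Plus);

  return fitsSignedBits(OffsetBits, Offset) && (AllowNegative || Offset >= 0);
}

int main() {
  // Plain FLAT with a negative offset: illegal pre-GFX12, legal on GFX12
  // (a 24-bit signed field is assumed here purely for illustration).
  std::printf("%d %d\n",
              isLegalFlatOffset(-16, FlatVariant::Flat, 24, false, false, false),
              isLegalFlatOffset(-16, FlatVariant::Flat, 24, true, false, false));
  return 0;
}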
@@ -8709,12 +8706,10 @@ SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
uint64_t FlatVariant) const {
int64_t RemainderOffset = COffsetVal;
int64_t ImmField = 0;
bool AllowNegative = FlatVariant != SIInstrFlags::FLAT;
if (ST.hasNegativeScratchOffsetBug() &&
FlatVariant == SIInstrFlags::FlatScratch)
AllowNegative = false;

bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;

if (AllowNegative) {
// Use signed division by a power of two to truncate towards 0.
int64_t D = 1LL << NumBits;
@@ -8738,6 +8733,14 @@ SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
return {ImmField, RemainderOffset};
}

bool SIInstrInfo::allowNegativeFlatOffset(uint64_t FlatVariant) const {
if (ST.hasNegativeScratchOffsetBug() &&
FlatVariant == SIInstrFlags::FlatScratch)
return false;

return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
}

static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
switch (ST.getGeneration()) {
default:
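The splitFlatOffset change above routes plain FLAT through the same allowNegativeFlatOffset query as global and scratch, and the "signed division by a power of two to truncate towards 0" comment is easiest to see with concrete numbers. A small standalone sketch of the AllowNegative path follows; the hardware-bug adjustment for negative unaligned scratch offsets is omitted, and the 23-bit magnitude is an illustrative assumption matching getNumFlatOffsetBits(ST) - 1 for a 24-bit signed field.

// Sketch of splitFlatOffset's math when AllowNegative is true: the immediate
// field keeps the part of the offset that fits in a signed field of
// NumBits + 1 bits, truncating toward zero, and the remainder is folded into
// the address computation.
#include <cstdint>
#include <cstdio>
#include <utility>

static std::pair<int64_t, int64_t> splitOffset(int64_t COffsetVal,
                                               unsigned NumBits) {
  // Use signed division by a power of two to truncate towards 0.
  int64_t D = 1ll << NumBits;
  int64_t RemainderOffset = (COffsetVal / D) * D;
  int64_t ImmField = COffsetVal - RemainderOffset;
  return {ImmField, RemainderOffset};
}

int main() {
  const unsigned NumBits = 23; // assumed: 24-bit field minus the sign bit
  for (int64_t Off : {100ll, 9000000ll, -9000000ll}) {
    auto [Imm, Rem] = splitOffset(Off, NumBits);
    std::printf("offset %lld -> imm %lld, remainder %lld\n",
                (long long)Off, (long long)Imm, (long long)Rem);
  }
  return 0;
}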
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1294,6 +1294,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
unsigned AddrSpace,
uint64_t FlatVariant) const;

/// Returns true if negative offsets are allowed for the given \p FlatVariant.
bool allowNegativeFlatOffset(uint64_t FlatVariant) const;

/// \brief Return a target-specific opcode if Opcode is a pseudo instruction.
/// Return -1 if the target-specific opcode for the pseudo instruction does
/// not exist. If Opcode is not a pseudo instruction, this is identity.
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX940 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX11 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX11 %s

define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(ptr %ptr, float %data) {
; GFX940-LABEL: name: flat_atomic_fadd_f32_no_rtn_intrinsic
