Skip to content

Commit

Permalink
[AMDGPU] Add GFX11 ds_bvh_stack_rtn_b32 instruction
Browse files Browse the repository at this point in the history
Differential Revision: https://reviews.llvm.org/D133928
  • Loading branch information
jayfoad committed Sep 15, 2022
1 parent f97cc6b commit 3822a01
Show file tree
Hide file tree
Showing 12 changed files with 172 additions and 2 deletions.
12 changes: 12 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -2004,6 +2004,18 @@ def int_amdgcn_ds_sub_gs_reg_rtn :
Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty],
[ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn]>;

// Intrinsic with two i32 results (%vdst and an %addr value). It takes an i32
// %addr, an i32 %data0, a <4 x i32> %data1 and an i32 %offset.
// ImmArg<ArgIndex<3>> requires %offset (argument index 3) to be an immediate.
def int_amdgcn_ds_bvh_stack_rtn :
Intrinsic<
[llvm_i32_ty, llvm_i32_ty], // %vdst, %addr
[
llvm_i32_ty, // %addr
llvm_i32_ty, // %data0
llvm_v4i32_ty, // %data1
llvm_i32_ty, // %offset
],
[ImmArg<ArgIndex<3>>, IntrWillReturn]
>;

// WMMA (Wave Matrix Multiply-Accumulate) intrinsics
//
// These operations perform a matrix multiplication and accumulation of
Expand Down
19 changes: 17 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ using namespace llvm;
//===----------------------------------------------------------------------===//

namespace {

// Look through a single ISD::BITCAST: returns the bitcast's source operand,
// or Val itself when Val is not a bitcast.
static SDValue stripBitcast(SDValue Val) {
  if (Val.getOpcode() != ISD::BITCAST)
    return Val;
  return Val.getOperand(0);
}
Expand Down Expand Up @@ -96,7 +95,7 @@ static SDValue stripExtractLoElt(SDValue In) {
return In;
}

} // end anonymous namespace
} // end anonymous namespace

INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "amdgpu-isel",
"AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
Expand Down Expand Up @@ -2380,6 +2379,19 @@ void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
}

// Tablegen cannot match instructions that produce multiple outputs, so this
// intrinsic is selected by hand instead of through generated patterns.
void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N) {
  // Grab the memory operand attached to the intrinsic node so it can be
  // re-attached to the selected machine node.
  MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();

  // Machine operands are node operands 2..5 followed by operand 0.
  // NOTE(review): operand 0 is presumably the chain and operand 1 (unused
  // here) the intrinsic ID -- confirm against INTRINSIC_W_CHAIN's layout.
  SDValue MachineOps[] = {N->getOperand(2), N->getOperand(3),
                          N->getOperand(4), N->getOperand(5),
                          N->getOperand(0)};

  SDNode *Result = CurDAG->SelectNodeTo(N, AMDGPU::DS_BVH_STACK_RTN_B32,
                                        N->getVTList(), MachineOps);
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Result), {MemOp});
}

static unsigned gwsIntrinToOpcode(unsigned IntrID) {
switch (IntrID) {
case Intrinsic::amdgcn_ds_gws_init:
Expand Down Expand Up @@ -2532,6 +2544,9 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
SelectDSAppendConsume(N, IntrID);
return;
}
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
SelectDSBvhStackIntrinsic(N);
return;
}

SelectCode(N);
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
void SelectBRCOND(SDNode *N);
void SelectFMAD_FMA(SDNode *N);
void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
void SelectDSBvhStackIntrinsic(SDNode *N);
void SelectDS_GWS(SDNode *N, unsigned IntrID);
void SelectInterpP1F16(SDNode *N);
void SelectINTRINSIC_W_CHAIN(SDNode *N);
Expand Down
29 changes: 29 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1803,6 +1803,33 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
return true;
}

// Tablegen cannot match instructions that produce multiple outputs, so the
// two-result DS_BVH_STACK_RTN_B32 instruction is built by hand here.
bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
    MachineInstr &MI) const {
  // Operands 0 and 1 are the two results.
  Register VDst = MI.getOperand(0).getReg();
  Register AddrOut = MI.getOperand(1).getReg();

  // Operand 2 is not read here (presumably the intrinsic ID -- confirm); the
  // register inputs and the immediate offset follow it.
  Register AddrIn = MI.getOperand(3).getReg();
  Register Data0 = MI.getOperand(4).getReg();
  Register Data1 = MI.getOperand(5).getReg();
  unsigned Imm = MI.getOperand(6).getImm();

  MachineBasicBlock &Block = *MI.getParent();
  const DebugLoc &Loc = MI.getDebugLoc();

  // Insert the machine instruction in front of MI, carrying over MI's memory
  // operands.
  auto NewInst = BuildMI(Block, &MI, Loc,
                         TII.get(AMDGPU::DS_BVH_STACK_RTN_B32), VDst);
  NewInst.addDef(AddrOut);
  NewInst.addUse(AddrIn);
  NewInst.addUse(Data0);
  NewInst.addUse(Data1);
  NewInst.addImm(Imm);
  NewInst.cloneMemRefs(MI);

  // The original instruction is replaced by the one just built.
  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*NewInst, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
MachineInstr &I) const {
unsigned IntrinsicID = I.getIntrinsicID();
Expand Down Expand Up @@ -1841,6 +1868,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
return false;
}
break;
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
return selectDSBvhStackIntrinsic(I);
}
return selectImpl(I, *CoverageInfo);
}
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;
bool selectSBarrier(MachineInstr &MI) const;
bool selectDSBvhStackIntrinsic(MachineInstr &MI) const;

bool selectImageIntrinsic(MachineInstr &MI,
const AMDGPU::ImageDimIntrinsicInfo *Intr) const;
Expand Down
14 changes: 14 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4745,6 +4745,20 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
break;
case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
OpdsMapping[0] =
getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); // %vdst
OpdsMapping[1] =
getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); // %addr
OpdsMapping[3] =
getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); // %addr
OpdsMapping[4] =
getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); // %data0
OpdsMapping[5] =
getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); // %data1
break;
}

default:
return getInvalidInstructionMapping();
}
Expand Down
9 changes: 9 additions & 0 deletions llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6382,11 +6382,20 @@ void AMDGPUAsmParser::cvtDSOffset01(MCInst &Inst,
void AMDGPUAsmParser::cvtDSImpl(MCInst &Inst, const OperandVector &Operands,
bool IsGdsHardcoded) {
OptionalImmIndexMap OptionalIdx;
const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
AMDGPUOperand::ImmTy OffsetType = AMDGPUOperand::ImmTyOffset;

for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);

auto TiedTo =
Desc.getOperandConstraint(Inst.getNumOperands(), MCOI::TIED_TO);

if (TiedTo != -1) {
assert((unsigned)TiedTo < Inst.getNumOperands());
Inst.addOperand(Inst.getOperand(TiedTo));
}

// Add the register arguments
if (Op.isReg()) {
Op.addRegOperands(Inst, 1);
Expand Down
15 changes: 15 additions & 0 deletions llvm/lib/Target/AMDGPU/DSInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,19 @@ multiclass DS_1A2D_Off8_RET_mc<string opName,
}
}

// DS pseudo with two outputs: $vdst and $addr. Inputs are $addr_in, $data0,
// $data1 and an offset operand; the asm string prints $addr, not $addr_in.
class DS_BVH_STACK<string opName>
: DS_Pseudo<opName,
(outs getLdStRegisterOperand<VGPR_32>.ret:$vdst, VGPR_32:$addr),
(ins VGPR_32:$addr_in, getLdStRegisterOperand<VGPR_32>.ret:$data0, VReg_128:$data1, offset:$offset),
" $vdst, $addr, $data0, $data1$offset"> {
// The $addr output is tied to the $addr_in input.
let Constraints = "$addr = $addr_in";
// $addr_in is left out of the encoding since it is tied to $addr.
let DisableEncoding = "$addr_in";
let has_gds = 0;
let gdsValue = 0;
// TODO: Use MMOs in the LDS address space instead of hasSideEffects = 1.
let hasSideEffects = 1;
// NOTE(review): presumably one WriteLDS entry per output operand -- confirm.
let SchedRW = [WriteLDS, WriteLDS];
}

class DS_1A_RET<string opName, RegisterClass rc = VGPR_32, bit HasTiedOutput = 0, Operand ofs = offset,
RegisterOperand data_op = getLdStRegisterOperand<rc>.ret>
Expand Down Expand Up @@ -713,6 +726,7 @@ let SubtargetPredicate = isGFX11Plus in {

def DS_ADD_GS_REG_RTN : DS_0A1D_RET_GDS<"ds_add_gs_reg_rtn", VReg_64, VGPR_32>;
def DS_SUB_GS_REG_RTN : DS_0A1D_RET_GDS<"ds_sub_gs_reg_rtn", VReg_64, VGPR_32>;
def DS_BVH_STACK_RTN_B32 : DS_BVH_STACK<"ds_bvh_stack_rtn_b32">;

} // let SubtargetPredicate = isGFX11Plus

Expand Down Expand Up @@ -1250,6 +1264,7 @@ defm DS_CMPSTORE_RTN_F64 : DS_Real_gfx11<0x071>;
defm DS_ADD_RTN_F32 : DS_Real_gfx11<0x079>;
defm DS_ADD_GS_REG_RTN : DS_Real_gfx11<0x07a>;
defm DS_SUB_GS_REG_RTN : DS_Real_gfx11<0x07b>;
defm DS_BVH_STACK_RTN_B32 : DS_Real_gfx11<0x0ad>;

//===----------------------------------------------------------------------===//
// GFX10.
Expand Down
17 changes: 17 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1161,6 +1161,23 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
MachineMemOperand::MOVolatile;
return true;
}
case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
Info.opc = ISD::INTRINSIC_W_CHAIN;

const GCNTargetMachine &TM =
static_cast<const GCNTargetMachine &>(getTargetMachine());

SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
Info.ptrVal = MFI->getGWSPSV(TM);

// This is an abstract access, but we need to specify a type and size.
Info.memVT = MVT::i32;
Info.size = 4;
Info.align = Align(4);

Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
return true;
}
default:
return false;
}
Expand Down
39 changes: 39 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.rtn.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s

declare { i32, i32 } @llvm.amdgcn.ds.bvh.stack.rtn(i32, i32, <4 x i32>, i32 immarg)

; Offset 0: calls the intrinsic, adds its two results together and stores the
; sum to %out. The expected instruction is printed without an offset modifier.
define amdgpu_gs void @test_ds_bvh_stack(i32 %addr, i32 %data0, <4 x i32> %data1, i32 addrspace(1)* %out) {
; CHECK-LABEL: test_ds_bvh_stack:
; CHECK: ; %bb.0:
; CHECK-NEXT: ds_bvh_stack_rtn_b32 v1, v0, v1, v[2:5]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_add_nc_u32_e32 v0, v1, v0
; CHECK-NEXT: global_store_b32 v[6:7], v0, off
; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; CHECK-NEXT: s_endpgm
%pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.rtn(i32 %addr, i32 %data0, <4 x i32> %data1, i32 0)
%vdst = extractvalue { i32, i32 } %pair, 0
%newaddr = extractvalue { i32, i32 } %pair, 1
%res = add i32 %vdst, %newaddr
store i32 %res, i32 addrspace(1)* %out, align 4
ret void
}

; Offset 1: same sequence as the offset-0 case, but the immediate offset
; argument is 1 and the expected instruction carries "offset:1".
define amdgpu_gs void @test_ds_bvh_stack_1(i32 %addr, i32 %data0, <4 x i32> %data1, i32 addrspace(1)* %out) {
; CHECK-LABEL: test_ds_bvh_stack_1:
; CHECK: ; %bb.0:
; CHECK-NEXT: ds_bvh_stack_rtn_b32 v1, v0, v1, v[2:5] offset:1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_add_nc_u32_e32 v0, v1, v0
; CHECK-NEXT: global_store_b32 v[6:7], v0, off
; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; CHECK-NEXT: s_endpgm
%pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.rtn(i32 %addr, i32 %data0, <4 x i32> %data1, i32 1)
%vdst = extractvalue { i32, i32 } %pair, 0
%newaddr = extractvalue { i32, i32 } %pair, 1
%res = add i32 %vdst, %newaddr
store i32 %res, i32 addrspace(1)* %out, align 4
ret void
}
6 changes: 6 additions & 0 deletions llvm/test/MC/AMDGPU/gfx11_asm_ds.s
Original file line number Diff line number Diff line change
Expand Up @@ -1961,3 +1961,9 @@ ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,8,7)

ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM, "01pip")
// GFX11: [0x07,0x09,0xd4,0xd8,0x02,0x00,0x00,0x08]

ds_bvh_stack_rtn_b32 v255, v254, v253, v[249:252]
// GFX11: [0x00,0x00,0xb4,0xda,0xfe,0xfd,0xf9,0xff]

ds_bvh_stack_rtn_b32 v255, v254, v253, v[249:252] offset:127
// GFX11: [0x7f,0x00,0xb4,0xda,0xfe,0xfd,0xf9,0xff]
12 changes: 12 additions & 0 deletions llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_ds.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5166,3 +5166,15 @@

# GFX11: ds_sub_gs_reg_rtn v[1:2], v255 gds ; encoding: [0x00,0x00,0xee,0xd9,0x00,0xff,0x00,0x01]
0x00,0x00,0xee,0xd9,0x00,0xff,0x00,0x01

# GFX11: ds_bvh_stack_rtn_b32 v1, v2, v3, v[4:7] offset:127 ; encoding: [0x7f,0x00,0xb4,0xda,0x02,0x03,0x04,0x01]
0x7f,0x00,0xb4,0xda,0x02,0x03,0x04,0x01

# GFX11: ds_bvh_stack_rtn_b32 v1, v2, v3, v[4:7] ; encoding: [0x00,0x00,0xb4,0xda,0x02,0x03,0x04,0x01]
0x00,0x00,0xb4,0xda,0x02,0x03,0x04,0x01

# GFX11: ds_bvh_stack_rtn_b32 v254, v255, v253, v[5:8] offset:127 ; encoding: [0x7f,0x00,0xb4,0xda,0xff,0xfd,0x05,0xfe]
0x7f,0x00,0xb4,0xda,0xff,0xfd,0x05,0xfe

# GFX11: ds_bvh_stack_rtn_b32 v254, v255, v253, v[5:8] ; encoding: [0x00,0x00,0xb4,0xda,0xff,0xfd,0x05,0xfe]
0x00,0x00,0xb4,0xda,0xff,0xfd,0x05,0xfe

0 comments on commit 3822a01

Please sign in to comment.