diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index e562ef04a3019..bd540bebd3731 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -64,6 +64,7 @@ BUILTIN(__builtin_amdgcn_s_barrier, "v", "n")
 BUILTIN(__builtin_amdgcn_wave_barrier, "v", "n")
 BUILTIN(__builtin_amdgcn_sched_barrier, "vIi", "n")
 BUILTIN(__builtin_amdgcn_sched_group_barrier, "vIiIiIi", "n")
+BUILTIN(__builtin_amdgcn_sched_group_barrier_inst, "vcC*IiIi", "n")
 BUILTIN(__builtin_amdgcn_iglp_opt, "vIi", "n")
 BUILTIN(__builtin_amdgcn_s_dcache_inv, "v", "n")
 BUILTIN(__builtin_amdgcn_buffer_wbinvl1, "v", "n")
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 1ed35befe1361..efcb0e80a4eb5 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18149,6 +18149,22 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
         CGM.getIntrinsic(Intrinsic::amdgcn_s_sendmsg_rtn, {ResultType});
     return Builder.CreateCall(F, {Arg});
   }
+  case AMDGPU::BI__builtin_amdgcn_sched_group_barrier_inst: {
+    // The builtin takes a constant string, but intrinsics cannot carry
+    // string operands, so wrap it in an MDTuple and pass it as metadata.
+    StringRef InstrStr;
+    llvm::getConstantStringInfo(EmitScalarExpr(E->getArg(0)), InstrStr);
+
+    llvm::MDBuilder MDHelper(getLLVMContext());
+    MDNode *InfoTuple =
+        MDTuple::get(getLLVMContext(), {MDHelper.createString(InstrStr)});
+    auto *MDV = MetadataAsValue::get(getLLVMContext(), InfoTuple);
+
+    Function *F =
+        CGM.getIntrinsic(Intrinsic::amdgcn_sched_group_barrier_inst, {});
+    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
+    llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
+    return Builder.CreateCall(F, {MDV, Src1, Src2});
+  }
   default:
     return nullptr;
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index 0bc9a54682d3e..d43a47746cf0d 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -436,6 +436,19 @@ void test_sched_group_barrier()
   __builtin_amdgcn_sched_group_barrier(15, 10000, -1);
 }
 
+// CHECK-LABEL: @test_sched_group_barrier_inst
+// CHECK: call void @llvm.amdgcn.sched.group.barrier.inst(metadata !16, i32 1, i32 2)
+// CHECK: call void @llvm.amdgcn.sched.group.barrier.inst(metadata !17, i32 3, i32 1)
+// CHECK: call void @llvm.amdgcn.sched.group.barrier.inst(metadata !16, i32 1000, i32 -1)
+// CHECK: call void @llvm.amdgcn.sched.group.barrier.inst(metadata !18, i32 1, i32 1)
+void test_sched_group_barrier_inst()
+{
+  __builtin_amdgcn_sched_group_barrier_inst("ds_r", 1, 2);
+  __builtin_amdgcn_sched_group_barrier_inst("v_cvt", 3, 1);
+  __builtin_amdgcn_sched_group_barrier_inst("ds_r", 1000, -1);
+  __builtin_amdgcn_sched_group_barrier_inst("1", 1, 1);
+}
+
 // CHECK-LABEL: @test_iglp_opt
 // CHECK: call void @llvm.amdgcn.iglp.opt(i32 0)
 // CHECK: call void @llvm.amdgcn.iglp.opt(i32 1)
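For context (not part of the patch), a minimal sketch of the kind of kernel this builtin is aimed at. The kernel body, the "ds_read"/"v_mfma" prefixes, and the group sizes are assumptions for illustration; whether a prefix matches depends on the machine instruction names the backend actually emits, and the groups are scheduling hints rather than guarantees:

    // Hypothetical usage sketch: ask the scheduler to alternate one LDS read
    // with two MFMAs. All four groups share SyncID 0, so together they
    // describe a single ordered pipeline.
    kernel void mma_pipeline_sketch(global float *out, local float *lds)
    {
        // ... code that emits ds_read_* and v_mfma_* instructions ...
        __builtin_amdgcn_sched_group_barrier_inst("ds_read", 1, 0);
        __builtin_amdgcn_sched_group_barrier_inst("v_mfma", 2, 0);
        __builtin_amdgcn_sched_group_barrier_inst("ds_read", 1, 0);
        __builtin_amdgcn_sched_group_barrier_inst("v_mfma", 2, 0);
    }
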
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index e5596258847f9..fd8b4581d97c8 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -302,6 +302,17 @@ def int_amdgcn_sched_group_barrier : ClangBuiltin<"__builtin_amdgcn_sched_group_barrier">,
   [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, IntrNoMem,
    IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
+// Similar to int_amdgcn_sched_group_barrier, except that scheduling group
+// inclusion is specified by a string, which is passed to the intrinsic via
+// metadata. The string is matched against the name of each instruction to
+// determine its inclusion in scheduling groups. The other parameters are the
+// same as above.
+def int_amdgcn_sched_group_barrier_inst :
+  Intrinsic<[], [llvm_metadata_ty, llvm_i32_ty, llvm_i32_ty],
+  [ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, IntrNoMem, IntrHasSideEffects,
+   IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
 // Scheduler optimization hint.
 //  MASK = 0: Small gemm opt
 def int_amdgcn_iglp_opt : ClangBuiltin<"__builtin_amdgcn_iglp_opt">,
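To make the relationship to the existing mask-based hint concrete, a comparison sketch (illustrative, not from the patch; it assumes the documented int_amdgcn_sched_group_barrier mask bits, where 0x00000100 selects DS read instructions, and assumes "ds_read" is the corresponding name prefix on the target):

    // Two ways to request a group of four DS reads (size 4, SyncID 0).
    void group_ds_reads(void)
    {
        // Existing form: instruction class chosen by mask bit 0x100 (DS read).
        __builtin_amdgcn_sched_group_barrier(0x100, 4, 0);
        // New form: instruction class chosen by name prefix instead.
        __builtin_amdgcn_sched_group_barrier_inst("ds_read", 4, 0);
    }

The string form can also name groups the fixed mask cannot express, for example only ds_read_b128, or the v_cvt family used in the tests above.
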
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 0dbcaf5a1b136..1373a3cd67174 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5426,6 +5426,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(LDS)
   NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD)
   NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD)
+  NODE_NAME_CASE(SCHED_GROUP_BARRIER_INST)
   NODE_NAME_CASE(DUMMY_CHAIN)
   case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
   NODE_NAME_CASE(LOAD_D16_HI)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 827fb106b5519..4aca9be5de78f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -539,6 +539,8 @@ enum NodeType : unsigned {
   FPTRUNC_ROUND_UPWARD,
   FPTRUNC_ROUND_DOWNWARD,
 
+  SCHED_GROUP_BARRIER_INST,
+
   DUMMY_CHAIN,
   FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
   LOAD_D16_HI,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index c24d39b9e5fdd..7b761ba79d21b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -247,6 +247,21 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
     return;
   }
 
+  if (MI->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER_INST) {
+    if (isVerbose()) {
+      // Recover the prefix string from the per-function metadata table and
+      // echo the group as an assembly comment.
+      AMDGPUMachineFunction *MFI = MF->getInfo<AMDGPUMachineFunction>();
+      auto *MDI = MFI->getMDInfo();
+      auto *MD = MDI->getMetadataFromIndex(MI->getOperand(0).getImm());
+      auto InstString = cast<MDString>(MD)->getString();
+      OutStreamer->emitRawComment(
+          " sched_group_barrier prefix_string(" + InstString + ") size(" +
+          Twine(MI->getOperand(1).getImm()) + ") SyncID(" +
+          Twine(MI->getOperand(2).getImm()) + ")");
+    }
+    return;
+  }
+
   if (MI->getOpcode() == AMDGPU::IGLP_OPT) {
     if (isVerbose()) {
       std::string HexString;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index 7efb7f825348e..b715c2b149c83 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -21,6 +21,27 @@ namespace llvm {
 
 class AMDGPUSubtarget;
 
+/// Assigns small per-function integer indices to metadata nodes so that
+/// metadata can be referenced from a machine instruction's immediate operand.
+class AMDGPUMetadataInfo {
+private:
+  unsigned Index = 0;
+  DenseMap<Metadata *, unsigned> MetadataToIndex;
+  DenseMap<unsigned, Metadata *> IndexToMetadata;
+
+public:
+  AMDGPUMetadataInfo() = default;
+  unsigned addOrGetMetadataIndex(Metadata *MD) {
+    if (MetadataToIndex.contains(MD))
+      return MetadataToIndex[MD];
+
+    MetadataToIndex[MD] = Index;
+    IndexToMetadata[Index] = MD;
+    return Index++;
+  }
+  Metadata *getMetadataFromIndex(unsigned Index) {
+    return IndexToMetadata[Index];
+  }
+};
+
 class AMDGPUMachineFunction : public MachineFunctionInfo {
   /// A map to keep track of local memory objects and their offsets within the
   /// local memory space.
@@ -67,6 +88,8 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
   // Kernel may need limited waves per EU for better performance.
   bool WaveLimiter = false;
 
+  AMDGPUMetadataInfo MDInfo;
+
 public:
   AMDGPUMachineFunction(const Function &F, const AMDGPUSubtarget &ST);
 
@@ -113,6 +136,8 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
     return allocateLDSGlobal(DL, GV, DynLDSAlign);
   }
 
+  AMDGPUMetadataInfo *getMDInfo() { return &MDInfo; }
+
   unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV,
                              Align Trailing);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 6ddc7e864fb23..2f958ddc25ba5 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9615,6 +9615,24 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
     return SDValue(NewMI, 0);
   }
+
+  case Intrinsic::amdgcn_sched_group_barrier_inst: {
+    if (auto *MSOP = dyn_cast<MDNodeSDNode>(Op.getOperand(2))) {
+      // Map the metadata string to a per-function index so it fits in an
+      // immediate operand; the AsmPrinter maps it back to the string.
+      const MDNode *Metadata = MSOP->getMD();
+      auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
+      auto *MDI = MFI->getMDInfo();
+      unsigned MDIndex = MDI->addOrGetMetadataIndex(Metadata->getOperand(0));
+      SmallVector<SDValue, 4> Ops = {
+          Op.getOperand(0), DAG.getTargetConstant(MDIndex, DL, MVT::i32),
+          Op.getOperand(3), Op.getOperand(4)};
+      return DAG.getNode(AMDGPUISD::SCHED_GROUP_BARRIER_INST, DL,
+                         Op.getValueType(), Ops);
+    }
+
+    // Fail to legalize.
+    return Op;
+  }
+
   default: {
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
             AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index b4bd46d33c1f1..52f24180af863 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -402,6 +402,24 @@ def SCHED_GROUP_BARRIER : SPseudoInstSI<
   let isMeta = 1;
 }
 
+def AMDGPUSGBI : SDNode<"AMDGPUISD::SCHED_GROUP_BARRIER_INST",
+  SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>]>,
+  [SDNPHasChain]>;
+
+def SCHED_GROUP_BARRIER_INST : SPseudoInstSI<
+  (outs),
+  (ins i32imm:$idx, i32imm:$size, i32imm:$syncid),
+  [(AMDGPUSGBI (i32 timm:$idx), (i32 timm:$size), (i32 timm:$syncid))]> {
+  let SchedRW = [];
+  let hasNoSchedulingInfo = 1;
+  let hasSideEffects = 1;
+  let mayLoad = 0;
+  let mayStore = 0;
+  let isConvergent = 1;
+  let FixedSize = 1;
+  let Size = 0;
+  let isMeta = 1;
+}
+
 def IGLP_OPT : SPseudoInstSI<(outs), (ins i32imm:$mask),
   [(int_amdgcn_iglp_opt (i32 timm:$mask))]> {
   let SchedRW = [];
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.inst.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.inst.ll
new file mode 100644
index 0000000000000..93cd8103bd60e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.inst.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -misched-cluster=0 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -misched-cluster=0 -amdgpu-igrouplp-exact-solver-max-branches=250000 < %s | FileCheck -check-prefix=EXACTCUTOFF %s
+
+define amdgpu_kernel void @test_sched_group_barrier() {
+; GCN-LABEL: test_sched_group_barrier:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    ; sched_group_barrier prefix_string(ds_r) size(1) SyncID(2)
+; GCN-NEXT:    ; sched_group_barrier prefix_string(v_cvt) size(3) SyncID(1)
+; GCN-NEXT:    ; sched_group_barrier prefix_string(ds_r) size(1000) SyncID(-1)
+; GCN-NEXT:    ; sched_group_barrier prefix_string(1) size(1) SyncID(1)
+; GCN-NEXT:    s_endpgm
+;
+; EXACTCUTOFF-LABEL: test_sched_group_barrier:
+; EXACTCUTOFF:       ; %bb.0: ; %entry
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier prefix_string(ds_r) size(1) SyncID(2)
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier prefix_string(v_cvt) size(3) SyncID(1)
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier prefix_string(ds_r) size(1000) SyncID(-1)
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier prefix_string(1) size(1) SyncID(1)
+; EXACTCUTOFF-NEXT:    s_endpgm
entry:
+  tail call void @llvm.amdgcn.sched.group.barrier.inst(metadata !1, i32 1, i32 2)
+  tail call void @llvm.amdgcn.sched.group.barrier.inst(metadata !2, i32 3, i32 1)
+  tail call void @llvm.amdgcn.sched.group.barrier.inst(metadata !1, i32 1000, i32 -1)
+  tail call void @llvm.amdgcn.sched.group.barrier.inst(metadata !3, i32 1, i32 1)
+  ret void
+}
+
+declare void @llvm.amdgcn.sched.group.barrier.inst(metadata, i32 immarg, i32 immarg)
+
+!1 = !{!"ds_r"}
+!2 = !{!"v_cvt"}
+!3 = !{!"1"}
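Finally, a sketch of the SyncID parameter (illustrative, not from the patch; it assumes the new builtin inherits the SyncID semantics of the existing __builtin_amdgcn_sched_group_barrier, where ordering is only enforced among groups that share a SyncID, and the prefixes are again assumed):

    // Two independent pipelines, kept separate by SyncID.
    void two_pipelines_sketch(void)
    {
        // Pipeline 0: alternate one LDS read with one MFMA.
        __builtin_amdgcn_sched_group_barrier_inst("ds_read", 1, 0);
        __builtin_amdgcn_sched_group_barrier_inst("v_mfma", 1, 0);
        // Pipeline 1: ordered independently of pipeline 0.
        __builtin_amdgcn_sched_group_barrier_inst("v_cvt", 2, 1);
        __builtin_amdgcn_sched_group_barrier_inst("v_mfma", 1, 1);
    }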