-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] GFX12: Add Split Workgroup Barrier #74836
Merged
mariusz-sikora-at-amd
merged 7 commits into
llvm:main
from
mariusz-sikora-at-amd:masikora/upstream-gfx12-sh-split-workgroup-barrier
Dec 13, 2023
Merged
[AMDGPU] GFX12: Add Split Workgroup Barrier #74836
mariusz-sikora-at-amd
merged 7 commits into
llvm:main
from
mariusz-sikora-at-amd:masikora/upstream-gfx12-sh-split-workgroup-barrier
Dec 13, 2023
Conversation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
mariusz-sikora-at-amd
requested review from
jayfoad,
vangthao95,
piotrAMD and
mbrkusanin
December 8, 2023 13:36
llvmbot
added
clang
Clang issues not falling into any other category
backend:AMDGPU
clang:frontend
Language frontend issues, e.g. anything involving "Sema"
mc
Machine (object) code
llvm:ir
labels
Dec 8, 2023
@llvm/pr-subscribers-mc @llvm/pr-subscribers-clang Author: Mariusz Sikora (mariusz-sikora-at-amd) ChangesPatch is 112.63 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/74836.diff 27 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 8b59b3790d7bc6..7465f13d552d6e 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -406,5 +406,21 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32, "iffiIb", "nc", "fp8-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf8_f32, "ifiiIi", "nc", "fp8-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", "fp8-insts")
+//===----------------------------------------------------------------------===//
+// GFX12+ only builtins.
+//===----------------------------------------------------------------------===//
+
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal, "vIi", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal_var, "vi", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_wait, "vIs", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal_isfirst, "bIi", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal_isfirst_var, "bi", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_init, "vii", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_join, "vi", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vi", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "b", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts")
+
+
#undef BUILTIN
#undef TARGET_BUILTIN
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl
new file mode 100644
index 00000000000000..5e0153c42825e3
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl
@@ -0,0 +1,24 @@
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -verify -S -emit-llvm -o - %s
+
+kernel void builtins_amdgcn_s_barrier_signal_err(global int* in, global int* out, int barrier) {
+
+ __builtin_amdgcn_s_barrier_signal(barrier); // expected-error {{'__builtin_amdgcn_s_barrier_signal' must be a constant integer}}
+ __builtin_amdgcn_s_barrier_wait(-1);
+ *out = *in;
+}
+
+kernel void builtins_amdgcn_s_barrier_wait_err(global int* in, global int* out, int barrier) {
+
+ __builtin_amdgcn_s_barrier_signal(-1);
+ __builtin_amdgcn_s_barrier_wait(barrier); // expected-error {{'__builtin_amdgcn_s_barrier_wait' must be a constant integer}}
+ *out = *in;
+}
+
+kernel void builtins_amdgcn_s_barrier_signal_isfirst_err(global int* in, global int* out, int barrier) {
+
+ __builtin_amdgcn_s_barrier_signal_isfirst(barrier); // expected-error {{'__builtin_amdgcn_s_barrier_signal_isfirst' must be a constant integer}}
+ __builtin_amdgcn_s_barrier_wait(-1);
+ *out = *in;
+}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
new file mode 100644
index 00000000000000..b8d281531e218e
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
@@ -0,0 +1,174 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S -emit-llvm -o - %s | FileCheck %s
+
+// CHECK-LABEL: @test_s_barrier_signal(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -1)
+// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+// CHECK-NEXT: ret void
+//
+void test_s_barrier_signal()
+{
+ __builtin_amdgcn_s_barrier_signal(-1);
+ __builtin_amdgcn_s_barrier_wait(-1);
+}
+
+// CHECK-LABEL: @test_s_barrier_signal_var(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(i32 [[TMP0]])
+// CHECK-NEXT: ret void
+//
+void test_s_barrier_signal_var(int a)
+{
+ __builtin_amdgcn_s_barrier_signal_var(a);
+}
+
+// CHECK-LABEL: @test_s_barrier_signal_isfirst(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: store ptr [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT: store ptr [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 8
+// CHECK-NEXT: store ptr [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 1)
+// CHECK-NEXT: br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+// CHECK: if.then:
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP1]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT: br label [[IF_END:%.*]]
+// CHECK: if.else:
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP2]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT: br label [[IF_END]]
+// CHECK: if.end:
+// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
+// CHECK-NEXT: ret void
+//
+void test_s_barrier_signal_isfirst(int* a, int* b, int *c)
+{
+ if(__builtin_amdgcn_s_barrier_signal_isfirst(1))
+ a = b;
+ else
+ a = c;
+
+ __builtin_amdgcn_s_barrier_wait(1);
+}
+
+// CHECK-LABEL: @test_s_barrier_isfirst_var(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: store ptr [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT: store ptr [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 8
+// CHECK-NEXT: store ptr [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 8
+// CHECK-NEXT: store i32 [[D:%.*]], ptr addrspace(5) [[D_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[D_ADDR]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst.var(i32 [[TMP0]])
+// CHECK-NEXT: br i1 [[TMP1]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+// CHECK: if.then:
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP2]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT: br label [[IF_END:%.*]]
+// CHECK: if.else:
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP3]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT: br label [[IF_END]]
+// CHECK: if.end:
+// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
+// CHECK-NEXT: ret void
+//
+void test_s_barrier_isfirst_var(int* a, int* b, int *c, int d)
+{
+ if ( __builtin_amdgcn_s_barrier_signal_isfirst_var(d))
+ a = b;
+ else
+ a = c;
+
+ __builtin_amdgcn_s_barrier_wait(1);
+
+}
+
+// CHECK-LABEL: @test_s_barrier_init(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.init(i32 1, i32 [[TMP0]])
+// CHECK-NEXT: ret void
+//
+void test_s_barrier_init(int a)
+{
+ __builtin_amdgcn_s_barrier_init(1, a);
+}
+
+// CHECK-LABEL: @test_s_barrier_join(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(i32 1)
+// CHECK-NEXT: ret void
+//
+void test_s_barrier_join()
+{
+ __builtin_amdgcn_s_barrier_join(1);
+}
+
+// CHECK-LABEL: @test_s_wakeup_barrier(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(i32 1)
+// CHECK-NEXT: ret void
+//
+void test_s_wakeup_barrier()
+{
+ __builtin_amdgcn_s_barrier_join(1);
+}
+
+// CHECK-LABEL: @test_s_barrier_leave(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: store ptr [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT: store ptr [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 8
+// CHECK-NEXT: store ptr [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = call i1 @llvm.amdgcn.s.barrier.leave()
+// CHECK-NEXT: br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+// CHECK: if.then:
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP1]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT: br label [[IF_END:%.*]]
+// CHECK: if.else:
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP2]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT: br label [[IF_END]]
+// CHECK: if.end:
+// CHECK-NEXT: ret void
+//
+void test_s_barrier_leave(int* a, int* b, int *c)
+{
+ if (__builtin_amdgcn_s_barrier_leave())
+ a = b;
+ else
+ a = c;
+}
+
+// CHECK-LABEL: @test_s_get_barrier_state(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[STATE:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.s.get.barrier.state(i32 [[TMP0]])
+// CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[STATE]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[STATE]], align 4
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+unsigned test_s_get_barrier_state(int a)
+{
+ unsigned State = __builtin_amdgcn_s_get_barrier_state(a);
+ return State;
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index bc9f99783d98f2..09e88152e65d2a 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -227,6 +227,45 @@ def int_amdgcn_s_sendmsg_rtn : Intrinsic <[llvm_anyint_ty], [llvm_i32_ty],
def int_amdgcn_s_barrier : ClangBuiltin<"__builtin_amdgcn_s_barrier">,
Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+def int_amdgcn_s_barrier_signal : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal">,
+ Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+ IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_barrier_signal_var : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_var">,
+ Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+ IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_barrier_signal_isfirst : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_isfirst">,
+ Intrinsic<[llvm_i1_ty], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
+ IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_barrier_signal_isfirst_var : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_isfirst_var">,
+ Intrinsic<[llvm_i1_ty], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+ IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_barrier_init : ClangBuiltin<"__builtin_amdgcn_s_barrier_init">,
+ Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent,
+ IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_barrier_join : ClangBuiltin<"__builtin_amdgcn_s_barrier_join">,
+ Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+ IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_wakeup_barrier : ClangBuiltin<"__builtin_amdgcn_s_wakeup_barrier">,
+ Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+ IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_barrier_wait : ClangBuiltin<"__builtin_amdgcn_s_barrier_wait">,
+ Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
+ IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_barrier_leave : ClangBuiltin<"__builtin_amdgcn_s_barrier_leave">,
+ Intrinsic<[llvm_i1_ty], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_get_barrier_state : ClangBuiltin<"__builtin_amdgcn_s_get_barrier_state">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+ IntrNoCallback, IntrNoFree]>;
+
def int_amdgcn_wave_barrier : ClangBuiltin<"__builtin_amdgcn_wave_barrier">,
Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index d24c7da964ce85..75fac09d0b99fa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1791,6 +1791,19 @@ bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
return true;
}
}
+
+ // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
+ if (STI.hasSplitBarriers()) {
+ MachineBasicBlock *MBB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
+ .addImm(AMDGPU::Barrier::WORKGROUP);
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT))
+ .addImm(AMDGPU::Barrier::WORKGROUP);
+ MI.eraseFromParent();
+ return true;
+ }
+
return selectImpl(MI, *CoverageInfo);
}
@@ -2137,6 +2150,16 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
break;
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
return selectDSBvhStackIntrinsic(I);
+ case Intrinsic::amdgcn_s_barrier_init:
+ case Intrinsic::amdgcn_s_barrier_join:
+ case Intrinsic::amdgcn_s_wakeup_barrier:
+ case Intrinsic::amdgcn_s_get_barrier_state:
+ return selectNamedBarrierInst(I, IntrinsicID);
+ case Intrinsic::amdgcn_s_barrier_signal_isfirst:
+ case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
+ return selectSBarrierSignalIsfirst(I, IntrinsicID);
+ case Intrinsic::amdgcn_s_barrier_leave:
+ return selectSBarrierLeave(I);
}
return selectImpl(I, *CoverageInfo);
}
@@ -5239,6 +5262,135 @@ AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
}};
}
+bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
+ MachineInstr &I, Intrinsic::ID IntrID) const {
+ MachineBasicBlock *MBB = I.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+ Register CCReg = I.getOperand(0).getReg();
+
+ bool HasM0 = IntrID == Intrinsic::amdgcn_s_barrier_signal_isfirst_var;
+
+ if (HasM0) {
+ auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
+ .addReg(I.getOperand(2).getReg());
+ BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0));
+ if (!constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI))
+ return false;
+ } else {
+ BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
+ .addImm(I.getOperand(2).getImm());
+ }
+
+ BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
+
+ I.eraseFromParent();
+ return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
+ *MRI);
+}
+
+unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
+ if (HasInlineConst) {
+ switch (IntrID) {
+ default:
+ llvm_unreachable("not a named barrier op");
+ case Intrinsic::amdgcn_s_barrier_init:
+ return AMDGPU::S_BARRIER_INIT_IMM;
+ case Intrinsic::amdgcn_s_barrier_join:
+ return AMDGPU::S_BARRIER_JOIN_IMM;
+ case Intrinsic::amdgcn_s_wakeup_barrier:
+ return AMDGPU::S_WAKEUP_BARRIER_IMM;
+ case Intrinsic::amdgcn_s_get_barrier_state:
+ return AMDGPU::S_GET_BARRIER_STATE_IMM;
+ };
+ } else {
+ switch (IntrID) {
+ default:
+ llvm_unreachable("not a named barrier op");
+ case Intrinsic::amdgcn_s_barrier_init:
+ return AMDGPU::S_BARRIER_INIT_M0;
+ case Intrinsic::amdgcn_s_barrier_join:
+ return AMDGPU::S_BARRIER_JOIN_M0;
+ case Intrinsic::amdgcn_s_wakeup_barrier:
+ return AMDGPU::S_WAKEUP_BARRIER_M0;
+ case Intrinsic::amdgcn_s_get_barrier_state:
+ return AMDGPU::S_GET_BARRIER_STATE_M0;
+ };
+ }
+}
+
+bool AMDGPUInstructionSelector::selectNamedBarrierInst(
+ MachineInstr &I, Intrinsic::ID IntrID) const {
+ MachineBasicBlock *MBB = I.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+ MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_barrier_state
+ ? I.getOperand(2)
+ : I.getOperand(1);
+ std::optional<int64_t> BarValImm =
+ getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
+ Register M0Val;
+ Register TmpReg0;
+
+ // For S_BARRIER_INIT, member count will always be read from M0[16:22]
+ if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
+ Register MemberCount = I.getOperand(2).getReg();
+ TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ // TODO: This should be expanded during legalization so that the the S_LSHL
+ // and S_OR can be constant-folded
+ BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
+ .addImm(16)
+ .addReg(MemberCount);
+ M0Val = TmpReg0;
+ }
+
+ // If not inlinable, get reference to barrier depending on the instruction
+ if (!BarValImm) {
+ if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
+ // If reference to barrier id is not an inlinable constant then it must be
+ // referenced with M0[4:0]. Perform an OR with the member count to include
+ // it in M0 for S_BARRIER_INIT.
+ Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg1)
+ .addReg(BarOp.getReg())
+ .addReg(TmpReg0);
+ M0Val = TmpReg1;
+ } else {
+ M0Val = BarOp.getReg();
+ }
+ }
+
+ // Build copy to M0 if needed. For S_BARRIER_INIT, M0 is always required.
+ if (M0Val) {
+ auto CopyMIB =
+ BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(M0Val);
+ constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
+ }
+
+ MachineInstrBuilder MIB;
+ unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
+ MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
+
+ if (IntrID == Intrinsic::amdgcn_s_get_barrier_state)
+ MIB.addDef(I.getOperand(0).getReg());
+
+ if (BarValImm)
+ MIB.addImm(*BarValImm);
+
+ I.eraseFromParent();
+ return true;
+}
+bool AMDGPUInstructionSelector::selectSBarrierLeave(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+ Register CCReg = I.getOperand(0).getReg();
+
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_BARRIER_LEAVE));
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
+
+ I.eraseFromParent();
+ return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
+ *MRI);
+}
+
void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
const MachineInstr &MI,
int OpIdx) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/...
[truncated]
|
ping |
piotrAMD
reviewed
Dec 12, 2023
…eam-gfx12-sh-split-workgroup-barrier
piotrAMD
reviewed
Dec 12, 2023
piotrAMD
approved these changes
Dec 12, 2023
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM with a nit.
mariusz-sikora-at-amd
deleted the
masikora/upstream-gfx12-sh-split-workgroup-barrier
branch
December 14, 2023 10:45
jayfoad
reviewed
Dec 14, 2023
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Labels
backend:AMDGPU
clang:frontend
Language frontend issues, e.g. anything involving "Sema"
clang
Clang issues not falling into any other category
llvm:ir
mc
Machine (object) code
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
No description provided.