
[AMDGPU] GFX12: Add Split Workgroup Barrier #74836

Conversation

mariusz-sikora-at-amd (Contributor)

No description provided.

@llvmbot added the clang (Clang issues not falling into any other category), backend:AMDGPU, clang:frontend (Language frontend issues, e.g. anything involving "Sema"), mc (Machine (object) code), and llvm:ir labels on Dec 8, 2023
@llvmbot (Collaborator) commented on Dec 8, 2023

@llvm/pr-subscribers-mc

@llvm/pr-subscribers-clang

Author: Mariusz Sikora (mariusz-sikora-at-amd)

Changes

Patch is 112.63 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/74836.diff

27 Files Affected:

  • (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+16)
  • (added) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl (+24)
  • (added) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl (+174)
  • (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+39)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (+152)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h (+3)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+44-1)
  • (modified) llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp (+11-1)
  • (modified) llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp (+11)
  • (modified) llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h (+1)
  • (modified) llvm/lib/Target/AMDGPU/GCNSubtarget.h (+3)
  • (modified) llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp (+1)
  • (modified) llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp (+1)
  • (modified) llvm/lib/Target/AMDGPU/SIDefines.h (+9)
  • (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+111-1)
  • (modified) llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (+5)
  • (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+8-1)
  • (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+7)
  • (modified) llvm/lib/Target/AMDGPU/SOPInstructions.td (+112)
  • (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h (+1)
  • (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp (+10)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll (+77)
  • (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll (+1366)
  • (modified) llvm/test/MC/AMDGPU/gfx12_asm_sop1.s (+45)
  • (modified) llvm/test/MC/AMDGPU/gfx12_asm_sopp.s (+9)
  • (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt (+53)
  • (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt (+9)
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 8b59b3790d7bc6..7465f13d552d6e 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -406,5 +406,21 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32, "iffiIb", "nc", "fp8-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf8_f32, "ifiiIi", "nc", "fp8-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", "fp8-insts")
 
+//===----------------------------------------------------------------------===//
+// GFX12+ only builtins.
+//===----------------------------------------------------------------------===//
+
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal, "vIi", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal_var, "vi", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_wait, "vIs", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal_isfirst, "bIi", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal_isfirst_var, "bi", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_init, "vii", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_join, "vi", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vi", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "b", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts")
+
+
 #undef BUILTIN
 #undef TARGET_BUILTIN
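
For orientation, here is a minimal usage sketch of the split pair, written in the same OpenCL C dialect as the new tests below. It is not part of the patch: the overlap pattern, the barrier id -1 (matching the immediates the tests use), and the neighbor exchange are illustrative assumptions, and the memory fences real code would need are elided.

    // Hedged sketch, not from this patch: signal arrival early, overlap
    // independent work, then block. Assumes -1 names the default workgroup
    // barrier, as the CodeGen tests in this PR do.
    kernel void split_barrier_overlap(global int *in, global int *out,
                                      local int *lds) {
      int lid = get_local_id(0);
      lds[lid] = in[lid];                    // publish a value to the group
      __builtin_amdgcn_s_barrier_signal(-1); // signal arrival, do not block yet
      out[lid] = in[lid] * 2;                // work that needs no synchronization
      __builtin_amdgcn_s_barrier_wait(-1);   // block until the whole group signaled
      out[lid] += lds[(lid + 1) % (int)get_local_size(0)]; // read a neighbor's value
    }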
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl
new file mode 100644
index 00000000000000..5e0153c42825e3
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl
@@ -0,0 +1,24 @@
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -verify -S -emit-llvm -o - %s
+
+kernel void builtins_amdgcn_s_barrier_signal_err(global int* in, global int* out, int barrier) {
+
+  __builtin_amdgcn_s_barrier_signal(barrier); // expected-error {{'__builtin_amdgcn_s_barrier_signal' must be a constant integer}}
+  __builtin_amdgcn_s_barrier_wait(-1);
+  *out = *in;
+}
+
+kernel void builtins_amdgcn_s_barrier_wait_err(global int* in, global int* out, int barrier) {
+
+  __builtin_amdgcn_s_barrier_signal(-1);
+  __builtin_amdgcn_s_barrier_wait(barrier); // expected-error {{'__builtin_amdgcn_s_barrier_wait' must be a constant integer}}
+  *out = *in;
+}
+
+kernel void builtins_amdgcn_s_barrier_signal_isfirst_err(global int* in, global int* out, int barrier) {
+
+  __builtin_amdgcn_s_barrier_signal_isfirst(barrier); // expected-error {{'__builtin_amdgcn_s_barrier_signal_isfirst' must be a constant integer}}
+  __builtin_amdgcn_s_barrier_wait(-1);
+  *out = *in;
+}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
new file mode 100644
index 00000000000000..b8d281531e218e
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
@@ -0,0 +1,174 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S -emit-llvm -o - %s | FileCheck %s
+
+// CHECK-LABEL: @test_s_barrier_signal(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal(i32 -1)
+// CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+// CHECK-NEXT:    ret void
+//
+void test_s_barrier_signal()
+{
+  __builtin_amdgcn_s_barrier_signal(-1);
+  __builtin_amdgcn_s_barrier_wait(-1);
+}
+
+// CHECK-LABEL: @test_s_barrier_signal_var(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal.var(i32 [[TMP0]])
+// CHECK-NEXT:    ret void
+//
+void test_s_barrier_signal_var(int a)
+{
+  __builtin_amdgcn_s_barrier_signal_var(a);
+}
+
+// CHECK-LABEL: @test_s_barrier_signal_isfirst(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:    store ptr [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 1)
+// CHECK-NEXT:    br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+// CHECK:       if.then:
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP1]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT:    br label [[IF_END:%.*]]
+// CHECK:       if.else:
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP2]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT:    br label [[IF_END]]
+// CHECK:       if.end:
+// CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 1)
+// CHECK-NEXT:    ret void
+//
+void test_s_barrier_signal_isfirst(int* a, int* b, int *c)
+{
+  if(__builtin_amdgcn_s_barrier_signal_isfirst(1))
+    a = b;
+  else
+    a = c;
+
+  __builtin_amdgcn_s_barrier_wait(1);
+}
+
+// CHECK-LABEL: @test_s_barrier_isfirst_var(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:    [[D_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    store ptr [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[D:%.*]], ptr addrspace(5) [[D_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[D_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst.var(i32 [[TMP0]])
+// CHECK-NEXT:    br i1 [[TMP1]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+// CHECK:       if.then:
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP2]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT:    br label [[IF_END:%.*]]
+// CHECK:       if.else:
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP3]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT:    br label [[IF_END]]
+// CHECK:       if.end:
+// CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 1)
+// CHECK-NEXT:    ret void
+//
+void test_s_barrier_isfirst_var(int* a, int* b, int *c, int d)
+{
+  if (__builtin_amdgcn_s_barrier_signal_isfirst_var(d))
+    a = b;
+  else
+    a = c;
+
+  __builtin_amdgcn_s_barrier_wait(1);
+
+}
+
+// CHECK-LABEL: @test_s_barrier_init(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.init(i32 1, i32 [[TMP0]])
+// CHECK-NEXT:    ret void
+//
+void test_s_barrier_init(int a)
+{
+  __builtin_amdgcn_s_barrier_init(1, a);
+}
+
+// CHECK-LABEL: @test_s_barrier_join(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.join(i32 1)
+// CHECK-NEXT:    ret void
+//
+void test_s_barrier_join()
+{
+  __builtin_amdgcn_s_barrier_join(1);
+}
+
+// CHECK-LABEL: @test_s_wakeup_barrier(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.amdgcn.s.wakeup.barrier(i32 1)
+// CHECK-NEXT:    ret void
+//
+void test_s_wakeup_barrier()
+{
+  __builtin_amdgcn_s_wakeup_barrier(1);
+}
+
+// CHECK-LABEL: @test_s_barrier_leave(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:    store ptr [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = call i1 @llvm.amdgcn.s.barrier.leave()
+// CHECK-NEXT:    br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+// CHECK:       if.then:
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP1]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT:    br label [[IF_END:%.*]]
+// CHECK:       if.else:
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP2]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT:    br label [[IF_END]]
+// CHECK:       if.end:
+// CHECK-NEXT:    ret void
+//
+void test_s_barrier_leave(int* a, int* b, int *c)
+{
+  if (__builtin_amdgcn_s_barrier_leave())
+    a = b;
+  else
+    a = c;
+}
+
+// CHECK-LABEL: @test_s_get_barrier_state(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[STATE:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.s.get.barrier.state(i32 [[TMP0]])
+// CHECK-NEXT:    store i32 [[TMP1]], ptr addrspace(5) [[STATE]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[STATE]], align 4
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+unsigned test_s_get_barrier_state(int a)
+{
+  unsigned State = __builtin_amdgcn_s_get_barrier_state(a);
+  return State;
+}
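
The tests above exercise each named-barrier builtin in isolation. As a hedged composition sketch (an assumed usage pattern, not taken from the patch): one work-item initializes a named barrier with a member count, the members join and wait on it, and the state query reads the result back. Barrier id 1 and the join/wait pairing are assumptions here, and fences are again elided.

    // Hypothetical lifecycle of a named barrier; id 1 is arbitrary.
    kernel void named_barrier_demo(global unsigned *state, int member_count) {
      if (get_local_id(0) == 0)
        __builtin_amdgcn_s_barrier_init(1, member_count); // create barrier 1
      __builtin_amdgcn_s_barrier_join(1);                 // become a member
      __builtin_amdgcn_s_barrier_wait(1);                 // rendezvous on it
      *state = __builtin_amdgcn_s_get_barrier_state(1);   // query its state
    }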
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index bc9f99783d98f2..09e88152e65d2a 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -227,6 +227,45 @@ def int_amdgcn_s_sendmsg_rtn : Intrinsic <[llvm_anyint_ty], [llvm_i32_ty],
 def int_amdgcn_s_barrier : ClangBuiltin<"__builtin_amdgcn_s_barrier">,
   Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
 
+def int_amdgcn_s_barrier_signal : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal">,
+  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+                                IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_barrier_signal_var : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_var">,
+  Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+                                IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_barrier_signal_isfirst : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_isfirst">,
+  Intrinsic<[llvm_i1_ty], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
+                                IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_barrier_signal_isfirst_var : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_isfirst_var">,
+  Intrinsic<[llvm_i1_ty], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+                                IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_barrier_init : ClangBuiltin<"__builtin_amdgcn_s_barrier_init">,
+  Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent,
+                                IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_barrier_join : ClangBuiltin<"__builtin_amdgcn_s_barrier_join">,
+  Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+                                IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_wakeup_barrier : ClangBuiltin<"__builtin_amdgcn_s_wakeup_barrier">,
+  Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+                                IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_barrier_wait : ClangBuiltin<"__builtin_amdgcn_s_barrier_wait">,
+  Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
+                                IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_barrier_leave : ClangBuiltin<"__builtin_amdgcn_s_barrier_leave">,
+  Intrinsic<[llvm_i1_ty], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_get_barrier_state : ClangBuiltin<"__builtin_amdgcn_s_get_barrier_state">,
+  Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+                                IntrNoCallback, IntrNoFree]>;
+
 def int_amdgcn_wave_barrier : ClangBuiltin<"__builtin_amdgcn_wave_barrier">,
   Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index d24c7da964ce85..75fac09d0b99fa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1791,6 +1791,19 @@ bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
       return true;
     }
   }
+
+  // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
+  if (STI.hasSplitBarriers()) {
+    MachineBasicBlock *MBB = MI.getParent();
+    const DebugLoc &DL = MI.getDebugLoc();
+    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
+        .addImm(AMDGPU::Barrier::WORKGROUP);
+    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT))
+        .addImm(AMDGPU::Barrier::WORKGROUP);
+    MI.eraseFromParent();
+    return true;
+  }
+
   return selectImpl(MI, *CoverageInfo);
 }
 
@@ -2137,6 +2150,16 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
     break;
   case Intrinsic::amdgcn_ds_bvh_stack_rtn:
     return selectDSBvhStackIntrinsic(I);
+  case Intrinsic::amdgcn_s_barrier_init:
+  case Intrinsic::amdgcn_s_barrier_join:
+  case Intrinsic::amdgcn_s_wakeup_barrier:
+  case Intrinsic::amdgcn_s_get_barrier_state:
+    return selectNamedBarrierInst(I, IntrinsicID);
+  case Intrinsic::amdgcn_s_barrier_signal_isfirst:
+  case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
+    return selectSBarrierSignalIsfirst(I, IntrinsicID);
+  case Intrinsic::amdgcn_s_barrier_leave:
+    return selectSBarrierLeave(I);
   }
   return selectImpl(I, *CoverageInfo);
 }
@@ -5239,6 +5262,135 @@ AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
   }};
 }
 
+bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
+    MachineInstr &I, Intrinsic::ID IntrID) const {
+  MachineBasicBlock *MBB = I.getParent();
+  const DebugLoc &DL = I.getDebugLoc();
+  Register CCReg = I.getOperand(0).getReg();
+
+  bool HasM0 = IntrID == Intrinsic::amdgcn_s_barrier_signal_isfirst_var;
+
+  if (HasM0) {
+    auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
+                       .addReg(I.getOperand(2).getReg());
+    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0));
+    if (!constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI))
+      return false;
+  } else {
+    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
+        .addImm(I.getOperand(2).getImm());
+  }
+
+  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
+
+  I.eraseFromParent();
+  return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
+                                      *MRI);
+}
+
+unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
+  if (HasInlineConst) {
+    switch (IntrID) {
+    default:
+      llvm_unreachable("not a named barrier op");
+    case Intrinsic::amdgcn_s_barrier_init:
+      return AMDGPU::S_BARRIER_INIT_IMM;
+    case Intrinsic::amdgcn_s_barrier_join:
+      return AMDGPU::S_BARRIER_JOIN_IMM;
+    case Intrinsic::amdgcn_s_wakeup_barrier:
+      return AMDGPU::S_WAKEUP_BARRIER_IMM;
+    case Intrinsic::amdgcn_s_get_barrier_state:
+      return AMDGPU::S_GET_BARRIER_STATE_IMM;
+    }
+  } else {
+    switch (IntrID) {
+    default:
+      llvm_unreachable("not a named barrier op");
+    case Intrinsic::amdgcn_s_barrier_init:
+      return AMDGPU::S_BARRIER_INIT_M0;
+    case Intrinsic::amdgcn_s_barrier_join:
+      return AMDGPU::S_BARRIER_JOIN_M0;
+    case Intrinsic::amdgcn_s_wakeup_barrier:
+      return AMDGPU::S_WAKEUP_BARRIER_M0;
+    case Intrinsic::amdgcn_s_get_barrier_state:
+      return AMDGPU::S_GET_BARRIER_STATE_M0;
+    }
+  }
+}
+
+bool AMDGPUInstructionSelector::selectNamedBarrierInst(
+    MachineInstr &I, Intrinsic::ID IntrID) const {
+  MachineBasicBlock *MBB = I.getParent();
+  const DebugLoc &DL = I.getDebugLoc();
+  MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_barrier_state
+                             ? I.getOperand(2)
+                             : I.getOperand(1);
+  std::optional<int64_t> BarValImm =
+      getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
+  Register M0Val;
+  Register TmpReg0;
+
+  // For S_BARRIER_INIT, member count will always be read from M0[16:22]
+  if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
+    Register MemberCount = I.getOperand(2).getReg();
+    TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    // TODO: This should be expanded during legalization so that the S_LSHL
+    // and S_OR can be constant-folded
+    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
+        .addImm(16)
+        .addReg(MemberCount);
+    M0Val = TmpReg0;
+  }
+
+  // If not inlinable, get reference to barrier depending on the instruction
+  if (!BarValImm) {
+    if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
+      // If reference to barrier id is not an inlinable constant then it must be
+      // referenced with M0[4:0]. Perform an OR with the member count to include
+      // it in M0 for S_BARRIER_INIT.
+      Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+      BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg1)
+          .addReg(BarOp.getReg())
+          .addReg(TmpReg0);
+      M0Val = TmpReg1;
+    } else {
+      M0Val = BarOp.getReg();
+    }
+  }
+
+  // Build copy to M0 if needed. For S_BARRIER_INIT, M0 is always required.
+  if (M0Val) {
+    auto CopyMIB =
+        BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(M0Val);
+    constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
+  }
+
+  MachineInstrBuilder MIB;
+  unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
+  MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
+
+  if (IntrID == Intrinsic::amdgcn_s_get_barrier_state)
+    MIB.addDef(I.getOperand(0).getReg());
+
+  if (BarValImm)
+    MIB.addImm(*BarValImm);
+
+  I.eraseFromParent();
+  return true;
+}
+bool AMDGPUInstructionSelector::selectSBarrierLeave(MachineInstr &I) const {
+  MachineBasicBlock *BB = I.getParent();
+  const DebugLoc &DL = I.getDebugLoc();
+  Register CCReg = I.getOperand(0).getReg();
+
+  BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_BARRIER_LEAVE));
+  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
+
+  I.eraseFromParent();
+  return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
+                                      *MRI);
+}
+
 void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
                                                  const MachineInstr &MI,
                                                  int OpIdx) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/...
[truncated]
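
One behavioral note from the selector hunk above: on GFX12 the legacy single-instruction barrier is itself rewritten into the signal/wait pair (S_BARRIER_SIGNAL_IMM followed by S_BARRIER_WAIT, both with the WORKGROUP id), so existing kernels need no source changes. A minimal sketch of that compatibility path follows; equating WORKGROUP with the -1 immediate seen in the tests is an assumption.

    // Unchanged source; on GFX12 the selector lowers the call below to
    // s_barrier_signal -1 followed by s_barrier_wait -1 (see selectSBarrier).
    kernel void legacy_barrier(global int *out) {
      int lid = get_local_id(0);
      out[lid] = lid;
      __builtin_amdgcn_s_barrier(); // split into signal + wait on GFX12
      out[lid] += 1;
    }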

@mariusz-sikora-at-amd (Contributor, Author) commented:

ping

@piotrAMD (Collaborator) left a comment:

LGTM with a nit.

@mariusz-sikora-at-amd merged commit 7f55d7d into llvm:main on Dec 13, 2023
4 checks passed
@mariusz-sikora-at-amd deleted the masikora/upstream-gfx12-sh-split-workgroup-barrier branch on Dec 14, 2023 at 10:45