diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 8b59b3790d7bc..7465f13d552d6 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -406,5 +406,21 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32, "iffiIb", "nc", "fp8-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf8_f32, "ifiiIi", "nc", "fp8-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", "fp8-insts")
 
+//===----------------------------------------------------------------------===//
+// GFX12+ only builtins.
+//===----------------------------------------------------------------------===//
+
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal, "vIi", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal_var, "vi", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_wait, "vIs", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal_isfirst, "bIi", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal_isfirst_var, "bi", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_init, "vii", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_join, "vi", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vi", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "b", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts")
+
+
 #undef BUILTIN
 #undef TARGET_BUILTIN
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl
new file mode 100644
index 0000000000000..5e0153c42825e
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl
@@ -0,0 +1,24 @@
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -verify -S -emit-llvm -o - %s
+
+kernel void builtins_amdgcn_s_barrier_signal_err(global int* in, global int* out, int barrier) {
+
+  __builtin_amdgcn_s_barrier_signal(barrier); // expected-error {{'__builtin_amdgcn_s_barrier_signal' must be a constant integer}}
+  __builtin_amdgcn_s_barrier_wait(-1);
+  *out = *in;
+}
+
+kernel void builtins_amdgcn_s_barrier_wait_err(global int* in, global int* out, int barrier) {
+
+  __builtin_amdgcn_s_barrier_signal(-1);
+  __builtin_amdgcn_s_barrier_wait(barrier); // expected-error {{'__builtin_amdgcn_s_barrier_wait' must be a constant integer}}
+  *out = *in;
+}
+
+kernel void builtins_amdgcn_s_barrier_signal_isfirst_err(global int* in, global int* out, int barrier) {
+
+  __builtin_amdgcn_s_barrier_signal_isfirst(barrier); // expected-error {{'__builtin_amdgcn_s_barrier_signal_isfirst' must be a constant integer}}
+  __builtin_amdgcn_s_barrier_wait(-1);
+  *out = *in;
+}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
new file mode 100644
index 0000000000000..b8d281531e218
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
@@ -0,0 +1,174 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S -emit-llvm -o - %s | FileCheck %s
+
+// CHECK-LABEL: @test_s_barrier_signal(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal(i32 -1)
+// CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+// CHECK-NEXT:    ret void
+//
+void test_s_barrier_signal()
+{
+  __builtin_amdgcn_s_barrier_signal(-1);
+  __builtin_amdgcn_s_barrier_wait(-1);
+}
+
+// CHECK-LABEL: @test_s_barrier_signal_var(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal.var(i32 [[TMP0]])
+// CHECK-NEXT:    ret void
+//
+void test_s_barrier_signal_var(int a)
+{
+  __builtin_amdgcn_s_barrier_signal_var(a);
+}
+
+// CHECK-LABEL: @test_s_barrier_signal_isfirst(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:    store ptr [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 1)
+// CHECK-NEXT:    br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+// CHECK:       if.then:
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP1]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT:    br label [[IF_END:%.*]]
+// CHECK:       if.else:
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP2]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT:    br label [[IF_END]]
+// CHECK:       if.end:
+// CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 1)
+// CHECK-NEXT:    ret void
+//
+void test_s_barrier_signal_isfirst(int* a, int* b, int *c)
+{
+  if(__builtin_amdgcn_s_barrier_signal_isfirst(1))
+    a = b;
+  else
+    a = c;
+
+  __builtin_amdgcn_s_barrier_wait(1);
+}
+
+// CHECK-LABEL: @test_s_barrier_isfirst_var(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:    [[D_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    store ptr [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[D:%.*]], ptr addrspace(5) [[D_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[D_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst.var(i32 [[TMP0]])
+// CHECK-NEXT:    br i1 [[TMP1]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+// CHECK:       if.then:
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP2]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT:    br label [[IF_END:%.*]]
+// CHECK:       if.else:
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP3]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT:    br label [[IF_END]]
+// CHECK:       if.end:
+// CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 1)
+// CHECK-NEXT:    ret void
+//
+void test_s_barrier_isfirst_var(int* a, int* b, int *c, int d)
+{
+  if ( __builtin_amdgcn_s_barrier_signal_isfirst_var(d))
+    a = b;
+  else
+    a = c;
+
+  __builtin_amdgcn_s_barrier_wait(1);
+
+}
+
+// CHECK-LABEL: @test_s_barrier_init(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.init(i32 1, i32 [[TMP0]])
+// CHECK-NEXT:    ret void
+//
+void test_s_barrier_init(int a)
+{
+  __builtin_amdgcn_s_barrier_init(1, a);
+}
+
+// CHECK-LABEL: @test_s_barrier_join(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.join(i32 1)
+// CHECK-NEXT:    ret void
+//
+void test_s_barrier_join()
+{
+  __builtin_amdgcn_s_barrier_join(1);
+}
+
+// CHECK-LABEL: @test_s_wakeup_barrier(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.amdgcn.s.wakeup.barrier(i32 1)
+// CHECK-NEXT:    ret void
+//
+void test_s_wakeup_barrier()
+{
+  __builtin_amdgcn_s_wakeup_barrier(1);
+}
+
+// CHECK-LABEL: @test_s_barrier_leave(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:    store ptr [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = call i1 @llvm.amdgcn.s.barrier.leave()
+// CHECK-NEXT:    br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+// CHECK:       if.then:
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP1]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT:    br label [[IF_END:%.*]]
+// CHECK:       if.else:
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP2]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT:    br label [[IF_END]]
+// CHECK:       if.end:
+// CHECK-NEXT:    ret void
+//
+void test_s_barrier_leave(int* a, int* b, int *c)
+{
+  if (__builtin_amdgcn_s_barrier_leave())
+    a = b;
+  else
+    a = c;
+}
+
+// CHECK-LABEL: @test_s_get_barrier_state(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[STATE:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.s.get.barrier.state(i32 [[TMP0]])
+// CHECK-NEXT:    store i32 [[TMP1]], ptr addrspace(5) [[STATE]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[STATE]], align 4
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+unsigned test_s_get_barrier_state(int a)
+{
+  unsigned State = __builtin_amdgcn_s_get_barrier_state(a);
+  return State;
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index bc9f99783d98f..09e88152e65d2 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -227,6 +227,45 @@ def int_amdgcn_s_sendmsg_rtn : Intrinsic <[llvm_anyint_ty], [llvm_i32_ty],
 def int_amdgcn_s_barrier : ClangBuiltin<"__builtin_amdgcn_s_barrier">,
   Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
 
+def int_amdgcn_s_barrier_signal : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal">,
+  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+  IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_barrier_signal_var : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_var">,
+  Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+  IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_barrier_signal_isfirst : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_isfirst">,
+  Intrinsic<[llvm_i1_ty], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
+  IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_barrier_signal_isfirst_var : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_isfirst_var">,
+  Intrinsic<[llvm_i1_ty], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+  IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_barrier_init : ClangBuiltin<"__builtin_amdgcn_s_barrier_init">,
+  Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent,
+  IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_barrier_join : ClangBuiltin<"__builtin_amdgcn_s_barrier_join">,
+  Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+  IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_wakeup_barrier : ClangBuiltin<"__builtin_amdgcn_s_wakeup_barrier">,
+  Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+  IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_barrier_wait : ClangBuiltin<"__builtin_amdgcn_s_barrier_wait">,
+  Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
+  IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_barrier_leave : ClangBuiltin<"__builtin_amdgcn_s_barrier_leave">,
+  Intrinsic<[llvm_i1_ty], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_get_barrier_state : ClangBuiltin<"__builtin_amdgcn_s_get_barrier_state">,
+  Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+  IntrNoCallback, IntrNoFree]>;
+
 def int_amdgcn_wave_barrier : ClangBuiltin<"__builtin_amdgcn_wave_barrier">,
   Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index d24c7da964ce8..75fac09d0b99f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1791,6 +1791,19 @@ bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
       return true;
     }
   }
+
+  // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
+  if (STI.hasSplitBarriers()) {
+    MachineBasicBlock *MBB = MI.getParent();
+    const DebugLoc &DL = MI.getDebugLoc();
+    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
+        .addImm(AMDGPU::Barrier::WORKGROUP);
+    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT))
+        .addImm(AMDGPU::Barrier::WORKGROUP);
+    MI.eraseFromParent();
+    return true;
+  }
+
   return selectImpl(MI, *CoverageInfo);
 }
 
@@ -2137,6 +2150,16 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
     break;
   case Intrinsic::amdgcn_ds_bvh_stack_rtn:
     return selectDSBvhStackIntrinsic(I);
+  case Intrinsic::amdgcn_s_barrier_init:
+  case Intrinsic::amdgcn_s_barrier_join:
+  case Intrinsic::amdgcn_s_wakeup_barrier:
+  case Intrinsic::amdgcn_s_get_barrier_state:
+    return selectNamedBarrierInst(I, IntrinsicID);
+  case Intrinsic::amdgcn_s_barrier_signal_isfirst:
+  case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
+    return selectSBarrierSignalIsfirst(I, IntrinsicID);
+  case Intrinsic::amdgcn_s_barrier_leave:
+    return selectSBarrierLeave(I);
   }
   return selectImpl(I, *CoverageInfo);
 }
 
@@ -5239,6 +5262,135 @@ AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
   }};
 }
 
+bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
+    MachineInstr &I, Intrinsic::ID IntrID) const {
+  MachineBasicBlock *MBB = I.getParent();
+  const DebugLoc &DL = I.getDebugLoc();
+  Register CCReg = I.getOperand(0).getReg();
+
+  bool HasM0 = IntrID == Intrinsic::amdgcn_s_barrier_signal_isfirst_var;
+
+  if (HasM0) {
+    auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
+                       .addReg(I.getOperand(2).getReg());
+    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0));
+    if (!constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI))
+      return false;
+  } else {
+    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
+        .addImm(I.getOperand(2).getImm());
+  }
+
+  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
+
+  I.eraseFromParent();
+  return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
+                                      *MRI);
+}
+
+unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
+  if (HasInlineConst) {
+    switch (IntrID) {
+    default:
+      llvm_unreachable("not a named barrier op");
+    case Intrinsic::amdgcn_s_barrier_init:
+      return AMDGPU::S_BARRIER_INIT_IMM;
+    case Intrinsic::amdgcn_s_barrier_join:
+      return AMDGPU::S_BARRIER_JOIN_IMM;
+    case Intrinsic::amdgcn_s_wakeup_barrier:
+      return AMDGPU::S_WAKEUP_BARRIER_IMM;
+    case Intrinsic::amdgcn_s_get_barrier_state:
+      return AMDGPU::S_GET_BARRIER_STATE_IMM;
+    };
+  } else {
+    switch (IntrID) {
+    default:
+      llvm_unreachable("not a named barrier op");
+    case Intrinsic::amdgcn_s_barrier_init:
+      return AMDGPU::S_BARRIER_INIT_M0;
+    case Intrinsic::amdgcn_s_barrier_join:
+      return AMDGPU::S_BARRIER_JOIN_M0;
+    case Intrinsic::amdgcn_s_wakeup_barrier:
+      return AMDGPU::S_WAKEUP_BARRIER_M0;
+    case Intrinsic::amdgcn_s_get_barrier_state:
+      return AMDGPU::S_GET_BARRIER_STATE_M0;
+    };
+  }
+}
+
+bool AMDGPUInstructionSelector::selectNamedBarrierInst(
+    MachineInstr &I, Intrinsic::ID IntrID) const {
+  MachineBasicBlock *MBB = I.getParent();
+  const DebugLoc &DL = I.getDebugLoc();
+  MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_barrier_state
+                             ? I.getOperand(2)
+                             : I.getOperand(1);
+  std::optional<int64_t> BarValImm =
+      getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
+  Register M0Val;
+  Register TmpReg0;
+
+  // For S_BARRIER_INIT, member count will always be read from M0[16:22]
+  if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
+    Register MemberCount = I.getOperand(2).getReg();
+    TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    // TODO: This should be expanded during legalization so that the S_LSHL
+    // and S_OR can be constant-folded
+    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
+        .addImm(16)
+        .addReg(MemberCount);
+    M0Val = TmpReg0;
+  }
+
+  // If not inlinable, get reference to barrier depending on the instruction
+  if (!BarValImm) {
+    if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
+      // If reference to barrier id is not an inlinable constant then it must be
+      // referenced with M0[4:0]. Perform an OR with the member count to include
+      // it in M0 for S_BARRIER_INIT.
+      Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+      BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg1)
+          .addReg(BarOp.getReg())
+          .addReg(TmpReg0);
+      M0Val = TmpReg1;
+    } else {
+      M0Val = BarOp.getReg();
+    }
+  }
+
+  // Build copy to M0 if needed. For S_BARRIER_INIT, M0 is always required.
+  if (M0Val) {
+    auto CopyMIB =
+        BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(M0Val);
+    constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
+  }
+
+  MachineInstrBuilder MIB;
+  unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
+  MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
+
+  if (IntrID == Intrinsic::amdgcn_s_get_barrier_state)
+    MIB.addDef(I.getOperand(0).getReg());
+
+  if (BarValImm)
+    MIB.addImm(*BarValImm);
+
+  I.eraseFromParent();
+  return true;
+}
+
+bool AMDGPUInstructionSelector::selectSBarrierLeave(MachineInstr &I) const {
+  MachineBasicBlock *BB = I.getParent();
+  const DebugLoc &DL = I.getDebugLoc();
+  Register CCReg = I.getOperand(0).getReg();
+
+  BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_BARRIER_LEAVE));
+  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
+
+  I.eraseFromParent();
+  return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
+                                      *MRI);
+}
+
 void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
                                                  const MachineInstr &MI,
                                                  int OpIdx) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index c93e3de66d405..00ff1747ce57a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -149,6 +149,9 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
   bool selectSMFMACIntrin(MachineInstr &I) const;
   bool selectWaveAddress(MachineInstr &I) const;
   bool selectStackRestore(MachineInstr &MI) const;
+  bool selectNamedBarrierInst(MachineInstr &I, Intrinsic::ID IID) const;
+  bool selectSBarrierSignalIsfirst(MachineInstr &I, Intrinsic::ID IID) const;
+  bool selectSBarrierLeave(MachineInstr &I) const;
 
   std::pair<Register, unsigned> selectVOP3ModsImpl(MachineOperand &Root,
                                                    bool IsCanonicalizing = true,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index df8e0c9400678..03b6d19b2b3c0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3101,6 +3101,22 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
       applyDefaultMapping(OpdMapper);
       constrainOpWithReadfirstlane(B, MI, 8); // M0
       return;
+    case Intrinsic::amdgcn_s_barrier_signal_var:
+    case Intrinsic::amdgcn_s_barrier_join:
+    case Intrinsic::amdgcn_s_wakeup_barrier:
+      constrainOpWithReadfirstlane(B, MI, 1);
+      return;
+    case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
+      constrainOpWithReadfirstlane(B, MI, 2);
+      return;
+    case Intrinsic::amdgcn_s_barrier_init:
+      constrainOpWithReadfirstlane(B, MI, 1);
+      constrainOpWithReadfirstlane(B, MI, 2);
+      return;
+    case Intrinsic::amdgcn_s_get_barrier_state: {
+      constrainOpWithReadfirstlane(B, MI, 2);
+      return;
+    }
     default: {
       if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
               AMDGPU::lookupRsrcIntrinsic(IntrID)) {
@@ -4833,7 +4849,34 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
           getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); // %data1
       break;
     }
-
+    case Intrinsic::amdgcn_s_barrier_signal_var:
+    case Intrinsic::amdgcn_s_barrier_join:
+    case Intrinsic::amdgcn_s_wakeup_barrier:
+      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+      break;
+    case Intrinsic::amdgcn_s_barrier_init:
+      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+      break;
+    case Intrinsic::amdgcn_s_barrier_signal_isfirst_var: {
+      const unsigned ResultSize = 1;
+      OpdsMapping[0] =
+          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, ResultSize);
+      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+      break;
+    }
+    case Intrinsic::amdgcn_s_barrier_signal_isfirst:
+    case Intrinsic::amdgcn_s_barrier_leave: {
+      const unsigned ResultSize = 1;
+      OpdsMapping[0] =
+          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, ResultSize);
+      break;
+    }
+    case Intrinsic::amdgcn_s_get_barrier_state: {
+      OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+      break;
+    }
     default:
       return getInvalidInstructionMapping();
     }
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 7e8080c5498d6..ced2a1214af84 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -893,6 +893,7 @@ class AMDGPUOperand : public MCParsedAsmOperand {
   bool isSDelayALU() const;
   bool isHwreg() const;
   bool isSendMsg() const;
+  bool isSplitBarrier() const;
   bool isSwizzle() const;
   bool isSMRDOffset8() const;
   bool isSMEMOffset() const;
@@ -1856,6 +1857,7 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) {
   case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
   case AMDGPU::OPERAND_REG_IMM_V2INT32:
   case AMDGPU::OPERAND_KIMM32:
+  case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32:
    return &APFloat::IEEEsingle();
   case AMDGPU::OPERAND_REG_IMM_INT64:
   case AMDGPU::OPERAND_REG_IMM_FP64:
@@ -2185,7 +2187,8 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
   case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
   case AMDGPU::OPERAND_REG_IMM_V2INT32:
   case AMDGPU::OPERAND_KIMM32:
-  case AMDGPU::OPERAND_KIMM16: {
+  case AMDGPU::OPERAND_KIMM16:
+  case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: {
     bool lost;
     APFloat FPLiteral(APFloat::IEEEdouble(), Literal);
     // Convert literal to single precision
@@ -2226,6 +2229,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
   case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
   case AMDGPU::OPERAND_REG_IMM_V2INT32:
   case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
+  case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32:
     if (isSafeTruncation(Val, 32) &&
         AMDGPU::isInlinableLiteral32(static_cast<int32_t>(Val),
                                      AsmParser->hasInv2PiInlineImm())) {
@@ -9161,3 +9165,9 @@ bool AMDGPUOperand::isWaitVDST() const {
 bool AMDGPUOperand::isWaitEXP() const {
   return isImmTy(ImmTyWaitEXP) && isUInt<3>(getImm());
 }
+
+//===----------------------------------------------------------------------===//
+// Split Barrier
+//===----------------------------------------------------------------------===//
+
+bool AMDGPUOperand::isSplitBarrier() const { return isInlinableImm(MVT::i32); }
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 01a7bd587cd63..1cfc5af571c1f 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -107,6 +107,13 @@ static DecodeStatus decodeBoolReg(MCInst &Inst, unsigned Val, uint64_t Addr,
   return addOperand(Inst, DAsm->decodeBoolReg(Val));
 }
 
+static DecodeStatus decodeSplitBarrier(MCInst &Inst, unsigned Val,
+                                       uint64_t Addr,
+                                       const MCDisassembler *Decoder) {
+  auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
+  return addOperand(Inst, DAsm->decodeSplitBarrier(Val));
+}
+
 #define DECODE_OPERAND(StaticDecoderName, DecoderName)                        \
   static DecodeStatus StaticDecoderName(MCInst &Inst, unsigned Imm,           \
                                         uint64_t /*Addr*/,                    \
@@ -1747,6 +1754,10 @@ MCOperand AMDGPUDisassembler::decodeBoolReg(unsigned Val) const {
                         : decodeSrcOp(OPW32, Val);
 }
 
+MCOperand AMDGPUDisassembler::decodeSplitBarrier(unsigned Val) const {
+  return decodeSrcOp(OPW32, Val);
+}
+
 bool AMDGPUDisassembler::isVI() const {
   return STI.hasFeature(AMDGPU::FeatureVolcanicIslands);
 }
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 7e233dcb54ea1..233581949d712 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -251,6 +251,7 @@ class AMDGPUDisassembler : public MCDisassembler {
   MCOperand decodeSDWAVopcDst(unsigned Val) const;
 
   MCOperand decodeBoolReg(unsigned Val) const;
+  MCOperand decodeSplitBarrier(unsigned Val) const;
 
   int getTTmpIdx(unsigned Val) const;
 
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index ac864325230f8..af9bab7cfe978 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1206,6 +1206,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return hasKernargPreload() && !hasGFX940Insts();
   }
 
+  // \returns true if the target has the split barriers feature
+  bool hasSplitBarriers() const { return getGeneration() >= GFX12; }
+
   // \returns true if FP8/BF8 VOP1 form of conversion to F32 is unreliable.
   bool hasCvtFP8VOP1Bug() const { return true; }
 
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index 57f74ae08b35c..6280ff55ad24f 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -708,6 +708,7 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
     case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
     case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
     case MCOI::OPERAND_IMMEDIATE:
+    case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32:
       printImmediate32(Op.getImm(), STI, O);
       break;
     case AMDGPU::OPERAND_REG_IMM_INT64:
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
index 80e7ca2b39d1b..b403d69d9ff13 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -262,6 +262,7 @@ AMDGPUMCCodeEmitter::getLitEncoding(const MCOperand &MO,
   case AMDGPU::OPERAND_REG_IMM_V2FP32:
   case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
   case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
+  case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32:
     return getLit32Encoding(static_cast<uint32_t>(Imm), STI);
 
   case AMDGPU::OPERAND_REG_IMM_INT64:
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 42a5d14155f4f..b291400a947c7 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -213,6 +213,9 @@ enum OperandType : unsigned {
   OPERAND_REG_INLINE_C_V2INT32,
   OPERAND_REG_INLINE_C_V2FP32,
 
+  // Operand for split barrier inline constant
+  OPERAND_INLINE_SPLIT_BARRIER_INT32,
+
   /// Operand with 32-bit immediate that uses the constant bus.
   OPERAND_KIMM32,
   OPERAND_KIMM16,
 
@@ -1026,6 +1029,12 @@ enum Register_Flag : uint8_t {
 
 } // namespace AMDGPU
 
+namespace AMDGPU {
+namespace Barrier {
+enum Type { TRAP = -2, WORKGROUP = -1 };
+} // namespace Barrier
+} // namespace AMDGPU
+
 // clang-format off
 
 #define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index f37f2d1a4b688..708f212e204ac 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -8629,6 +8629,31 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
                                    M->getVTList(), Ops, M->getMemoryVT(),
                                    M->getMemOperand());
   }
+  case Intrinsic::amdgcn_s_get_barrier_state: {
+    SDValue Chain = Op->getOperand(0);
+    SmallVector<SDValue, 2> Ops;
+    unsigned Opc;
+    bool IsInlinableBarID = false;
+    int64_t BarID;
+
+    if (isa<ConstantSDNode>(Op->getOperand(2))) {
+      BarID = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue();
+      IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarID);
+    }
+
+    if (IsInlinableBarID) {
+      Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
+      SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
+      Ops.push_back(K);
+    } else {
+      Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
+      SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(2));
+      Ops.push_back(M0Val.getValue(0));
+    }
+
+    auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
+    return SDValue(NewMI, 0);
+  }
   default:
 
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
@@ -8806,13 +8831,29 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
   }
   case Intrinsic::amdgcn_s_barrier: {
+    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
     if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
-      const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
       unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
       if (WGSize <= ST.getWavefrontSize())
         return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
                                           Op.getOperand(0)), 0);
     }
+
+    // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
+    if (ST.hasSplitBarriers()) {
+      SDValue K =
+          DAG.getTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32);
+      SDValue BarSignal =
+          SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
+                                     MVT::Other, K, Op.getOperand(0)),
+                  0);
+      SDValue BarWait =
+          SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
+                                     BarSignal.getValue(0)),
+                  0);
+      return BarWait;
+    }
+
     return SDValue();
   };
   case Intrinsic::amdgcn_tbuffer_store: {
@@ -9198,7 +9239,76 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
   case Intrinsic::amdgcn_end_cf:
     return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
                                       Op->getOperand(2), Chain), 0);
-
+  case Intrinsic::amdgcn_s_barrier_init:
+  case Intrinsic::amdgcn_s_barrier_join:
+  case Intrinsic::amdgcn_s_wakeup_barrier: {
+    SDValue Chain = Op->getOperand(0);
+    SmallVector<SDValue, 2> Ops;
+    SDValue BarOp = Op->getOperand(2);
+    unsigned Opc;
+    bool IsInlinableBarID = false;
+    int64_t BarVal;
+
+    if (isa<ConstantSDNode>(BarOp)) {
+      BarVal = cast<ConstantSDNode>(BarOp)->getSExtValue();
+      IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarVal);
+    }
+
+    if (IsInlinableBarID) {
+      switch (IntrinsicID) {
+      default:
+        return SDValue();
+      case Intrinsic::amdgcn_s_barrier_init:
+        Opc = AMDGPU::S_BARRIER_INIT_IMM;
+        break;
+      case Intrinsic::amdgcn_s_barrier_join:
+        Opc = AMDGPU::S_BARRIER_JOIN_IMM;
+        break;
+      case Intrinsic::amdgcn_s_wakeup_barrier:
+        Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
+        break;
+      }
+
+      SDValue K = DAG.getTargetConstant(BarVal, DL, MVT::i32);
+      Ops.push_back(K);
+    } else {
+      switch (IntrinsicID) {
+      default:
+        return SDValue();
+      case Intrinsic::amdgcn_s_barrier_init:
+        Opc = AMDGPU::S_BARRIER_INIT_M0;
+        break;
+      case Intrinsic::amdgcn_s_barrier_join:
+        Opc = AMDGPU::S_BARRIER_JOIN_M0;
+        break;
+      case Intrinsic::amdgcn_s_wakeup_barrier:
+        Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
+        break;
+      }
+    }
+
+    if (IntrinsicID == Intrinsic::amdgcn_s_barrier_init) {
+      SDValue M0Val;
+      // Member count will be read from M0[16:22]
+      M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, Op.getOperand(3),
+                          DAG.getShiftAmountConstant(16, MVT::i32, DL));
+
+      if (!IsInlinableBarID) {
+        // If reference to barrier id is not an inline constant then it must be
+        // referenced with M0[4:0]. Perform an OR with the member count to
+        // include it in M0.
+        M0Val = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32,
+                                           Op.getOperand(2), M0Val),
+                        0);
+      }
+      Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
+    } else if (!IsInlinableBarID) {
+      Ops.push_back(copyToM0(DAG, Chain, DL, BarOp).getValue(0));
+    }
+
+    auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
+    return SDValue(NewMI, 0);
+  }
   default: {
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
             AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ede4841b8a5fd..13426e7321b67 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1504,6 +1504,11 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
       break;
     case AMDGPU::S_MEMTIME:
     case AMDGPU::S_MEMREALTIME:
+    case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
+    case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
+    case AMDGPU::S_BARRIER_LEAVE:
+    case AMDGPU::S_GET_BARRIER_STATE_M0:
+    case AMDGPU::S_GET_BARRIER_STATE_IMM:
       ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
       break;
     }
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 8a226a321af09..d4746b559d925 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4118,7 +4118,8 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
   case AMDGPU::OPERAND_REG_IMM_V2INT32:
   case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
   case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
-  case AMDGPU::OPERAND_REG_INLINE_AC_FP32: {
+  case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+  case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: {
     int32_t Trunc = static_cast<int32_t>(Imm);
     return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
   }
@@ -4559,6 +4560,12 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
       }
       break;
     }
+    case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32:
+      if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
+        ErrInfo = "Expected inline constant for operand.";
+        return false;
+      }
+      break;
    case MCOI::OPERAND_IMMEDIATE:
    case AMDGPU::OPERAND_KIMM32:
      // Check if this operand is an immediate.
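Editorial note, not part of the patch: throughout the changes above, the split between the _IMM and _M0 instruction flavors hinges on whether the barrier ID passes the same inline-constant test that OPERAND_INLINE_SPLIT_BARRIER_INT32 is wired into here (AMDGPU::isInlinableIntLiteral / isInlinableLiteral32). A minimal standalone sketch of that predicate for the integer case, assuming the usual AMDGPU integer inline-constant range of -16..64 (the named IDs AMDGPU::Barrier::WORKGROUP = -1 and TRAP = -2 both fall inside it):

// Editorial sketch only; mirrors AMDGPU::isInlinableIntLiteral for integers.
// Barrier IDs in this range select the S_*_IMM forms; anything else is
// copied through M0 and selects the S_*_M0 forms.
static bool isInlinableBarrierId(int64_t BarId) {
  return BarId >= -16 && BarId <= 64; // AMDGPU 32-bit integer inline constants
}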
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 9e60bdda5ef3a..6c106b8b68b5c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -919,6 +919,13 @@ def InterpAttr : CustomOperand<i32>;
 
 def InterpAttrChan : ImmOperand<i32>;
 
+def SplitBarrier : ImmOperand<i32> {
+  let OperandNamespace = "AMDGPU";
+  let OperandType = "OPERAND_INLINE_SPLIT_BARRIER_INT32";
+  let DecoderMethod = "decodeSplitBarrier";
+  let PrintMethod = "printOperand";
+}
+
 def VReg32OrOffClass : AsmOperandClass {
   let Name = "VReg32OrOff";
   let ParserMethod = "parseVReg32OrOff";
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index b9a488f9cf7b3..50c4d279cfe23 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -438,6 +438,89 @@ let SubtargetPredicate = HasSALUFloatInsts, Uses = [MODE],
 } // End SubtargetPredicate = HasSALUFloatInsts, Uses = [MODE]
   // SchedRW = [WriteSFPU], isReMaterializable = 1
 
+let hasSideEffects = 1 in {
+let has_sdst = 0 in {
+let Uses = [M0] in {
+def S_BARRIER_SIGNAL_M0 : SOP1_Pseudo <"s_barrier_signal m0", (outs), (ins),
+  "", [(int_amdgcn_s_barrier_signal_var M0)]>{
+  let SchedRW = [WriteBarrier];
+  let isConvergent = 1;
+}
+
+def S_BARRIER_SIGNAL_ISFIRST_M0 : SOP1_Pseudo <"s_barrier_signal_isfirst m0", (outs), (ins),
+  "", [(set SCC, (int_amdgcn_s_barrier_signal_isfirst_var M0))]>{
+  let Defs = [SCC];
+  let SchedRW = [WriteBarrier];
+  let isConvergent = 1;
+}
+
+def S_BARRIER_INIT_M0 : SOP1_Pseudo <"s_barrier_init m0", (outs), (ins),
+  "", []>{
+  let SchedRW = [WriteBarrier];
+  let isConvergent = 1;
+}
+
+def S_BARRIER_JOIN_M0 : SOP1_Pseudo <"s_barrier_join m0", (outs), (ins),
+  "", []>{
+  let SchedRW = [WriteBarrier];
+  let isConvergent = 1;
+}
+
+def S_WAKEUP_BARRIER_M0 : SOP1_Pseudo <"s_wakeup_barrier m0", (outs), (ins),
+  "", []>{
+  let SchedRW = [WriteBarrier];
+  let isConvergent = 1;
+}
+} // End Uses = [M0]
+
+def S_BARRIER_SIGNAL_IMM : SOP1_Pseudo <"s_barrier_signal", (outs),
+  (ins SplitBarrier:$src0), "$src0", [(int_amdgcn_s_barrier_signal timm:$src0)]>{
+  let SchedRW = [WriteBarrier];
+  let isConvergent = 1;
+}
+
+def S_BARRIER_SIGNAL_ISFIRST_IMM : SOP1_Pseudo <"s_barrier_signal_isfirst", (outs),
+  (ins SplitBarrier:$src0), "$src0", [(set SCC, (int_amdgcn_s_barrier_signal_isfirst timm:$src0))]>{
+  let Defs = [SCC];
+  let SchedRW = [WriteBarrier];
+  let isConvergent = 1;
+}
+
+def S_BARRIER_INIT_IMM : SOP1_Pseudo <"s_barrier_init", (outs),
+  (ins SplitBarrier:$src0), "$src0", []>{
+  let SchedRW = [WriteBarrier];
+  let isConvergent = 1;
+}
+
+def S_BARRIER_JOIN_IMM : SOP1_Pseudo <"s_barrier_join", (outs),
+  (ins SplitBarrier:$src0), "$src0", []>{
+  let SchedRW = [WriteBarrier];
+  let isConvergent = 1;
+}
+
+def S_WAKEUP_BARRIER_IMM : SOP1_Pseudo <"s_wakeup_barrier", (outs),
+  (ins SplitBarrier:$src0), "$src0", []>{
+  let SchedRW = [WriteBarrier];
+  let isConvergent = 1;
+}
+} // End has_sdst = 0
+
+def S_GET_BARRIER_STATE_IMM : SOP1_Pseudo <"s_get_barrier_state", (outs SSrc_b32:$sdst),
+  (ins SplitBarrier:$src0), "$sdst, $src0", []>{
+  let SchedRW = [WriteBarrier];
+  let isConvergent = 1;
+}
+
+def S_GET_BARRIER_STATE_M0 : SOP1_Pseudo <"s_get_barrier_state $sdst, m0", (outs SSrc_b32:$sdst),
+  (ins), "", []>{
+  let Uses = [M0];
+  let SchedRW = [WriteBarrier];
+  let isConvergent = 1;
+}
+} // End hasSideEffects = 1
+
 //===----------------------------------------------------------------------===//
 // SOP2 Instructions
 //===----------------------------------------------------------------------===//
 
@@ -1482,6 +1565,21 @@ def S_BARRIER : SOPP_Pseudo <"s_barrier", (ins), "",
   let isConvergent = 1;
 }
 
+def S_BARRIER_WAIT : SOPP_Pseudo <"s_barrier_wait", (ins i16imm:$simm16), "$simm16",
+  [(int_amdgcn_s_barrier_wait timm:$simm16)]> {
+  let SchedRW = [WriteBarrier];
+  let isConvergent = 1;
+}
+
+def S_BARRIER_LEAVE : SOPP_Pseudo <"s_barrier_leave", (ins), "",
+  [(set SCC, (int_amdgcn_s_barrier_leave))]> {
+  let SchedRW = [WriteBarrier];
+  let simm16 = 0;
+  let fixed_imm = 1;
+  let isConvergent = 1;
+  let Defs = [SCC];
+}
+
 def S_WAKEUP : SOPP_Pseudo <"s_wakeup", (ins) > {
   let SubtargetPredicate = isGFX8Plus;
   let simm16 = 0;
@@ -1887,6 +1985,18 @@ defm S_SWAPPC_B64 : SOP1_Real_gfx11_gfx12<0x049>;
 defm S_RFE_B64 : SOP1_Real_gfx11_gfx12<0x04a>;
 defm S_SENDMSG_RTN_B32 : SOP1_Real_gfx11_gfx12<0x04c>;
 defm S_SENDMSG_RTN_B64 : SOP1_Real_gfx11_gfx12<0x04d>;
+defm S_BARRIER_SIGNAL_M0 : SOP1_M0_Real_gfx12<0x04e>;
+defm S_BARRIER_SIGNAL_ISFIRST_M0 : SOP1_M0_Real_gfx12<0x04f>;
+defm S_GET_BARRIER_STATE_M0 : SOP1_M0_Real_gfx12<0x050>;
+defm S_BARRIER_INIT_M0 : SOP1_M0_Real_gfx12<0x051>;
+defm S_BARRIER_JOIN_M0 : SOP1_M0_Real_gfx12<0x052>;
+defm S_WAKEUP_BARRIER_M0 : SOP1_M0_Real_gfx12<0x057>;
+defm S_BARRIER_SIGNAL_IMM : SOP1_Real_gfx12<0x04e>;
+defm S_BARRIER_SIGNAL_ISFIRST_IMM : SOP1_Real_gfx12<0x04f>;
+defm S_GET_BARRIER_STATE_IMM : SOP1_Real_gfx12<0x050>;
+defm S_BARRIER_INIT_IMM : SOP1_Real_gfx12<0x051>;
+defm S_BARRIER_JOIN_IMM : SOP1_Real_gfx12<0x052>;
+defm S_WAKEUP_BARRIER_IMM : SOP1_Real_gfx12<0x057>;
 
 //===----------------------------------------------------------------------===//
 // SOP1 - GFX1150, GFX12
@@ -2378,6 +2488,8 @@ multiclass SOPP_Real_32_Renamed_gfx12<bits<7> op, SOPP_Pseudo backing_pseudo, string real_name> {
 }
 
 defm S_WAIT_ALU : SOPP_Real_32_Renamed_gfx12<0x008, S_WAITCNT_DEPCTR, "s_wait_alu">;
+defm S_BARRIER_WAIT : SOPP_Real_32_gfx12<0x014>;
+defm S_BARRIER_LEAVE : SOPP_Real_32_gfx12<0x015>;
 
 //===----------------------------------------------------------------------===//
 // SOPP - GFX11, GFX12.
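Editorial note, not part of the patch: for the M0 forms defined above, S_BARRIER_INIT reads the member count from M0[16:22] and, when the barrier ID is not an inline constant, the ID from M0[4:0], matching the S_LSHL_B32/S_OR_B32 sequences the selector and the SelectionDAG lowering materialize earlier in this patch. A hedged C++ sketch of that packing (the 0x1f mask here is illustrative of the M0[4:0] field; the patch itself ORs the raw ID register without masking):

// Editorial sketch only: the M0 layout S_BARRIER_INIT expects.
// Member count occupies M0[16:22]; the barrier ID occupies M0[4:0].
static uint32_t packBarrierInitM0(uint32_t MemberCount, uint32_t BarrierId) {
  return (MemberCount << 16) | (BarrierId & 0x1f);
}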
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index bd333ed2cca71..d63b0d6a6e78d 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1238,6 +1238,7 @@ inline unsigned getOperandSize(const MCOperandInfo &OpInfo) {
   case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
   case AMDGPU::OPERAND_KIMM32:
   case AMDGPU::OPERAND_KIMM16: // mandatory literal is always size 4
+  case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32:
     return 4;
 
   case AMDGPU::OPERAND_REG_IMM_INT64:
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
index cbdbf1c16f9f0..25e628e5cbc55 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
@@ -74,6 +74,16 @@ bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) {
   if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
     switch (II->getIntrinsicID()) {
     case Intrinsic::amdgcn_s_barrier:
+    case Intrinsic::amdgcn_s_barrier_signal:
+    case Intrinsic::amdgcn_s_barrier_signal_var:
+    case Intrinsic::amdgcn_s_barrier_signal_isfirst:
+    case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
+    case Intrinsic::amdgcn_s_barrier_init:
+    case Intrinsic::amdgcn_s_barrier_join:
+    case Intrinsic::amdgcn_s_barrier_wait:
+    case Intrinsic::amdgcn_s_barrier_leave:
+    case Intrinsic::amdgcn_s_get_barrier_state:
+    case Intrinsic::amdgcn_s_wakeup_barrier:
     case Intrinsic::amdgcn_wave_barrier:
     case Intrinsic::amdgcn_sched_barrier:
     case Intrinsic::amdgcn_sched_group_barrier:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
index 48c4e0276edda..4e65b37633949 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
@@ -3,6 +3,9 @@
 ; RUN: llc -march=amdgcn -mattr=+auto-waitcnt-before-barrier -verify-machineinstrs < %s | FileCheck --check-prefix=VARIANT1 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=VARIANT2 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+auto-waitcnt-before-barrier -verify-machineinstrs < %s | FileCheck --check-prefix=VARIANT3 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=VARIANT4 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=+auto-waitcnt-before-barrier -verify-machineinstrs < %s | FileCheck --check-prefix=VARIANT5 %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=VARIANT6 %s
 
 define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
 ; VARIANT0-LABEL: test_barrier:
@@ -85,6 +88,80 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
 ; VARIANT3-NEXT:    s_waitcnt vmcnt(0)
 ; VARIANT3-NEXT:    global_store_dword v2, v0, s[2:3]
 ; VARIANT3-NEXT:    s_endpgm
+;
+; VARIANT4-LABEL: test_barrier:
+; VARIANT4:       ; %bb.0: ; %entry
+; VARIANT4-NEXT:    s_clause 0x1
+; VARIANT4-NEXT:    s_load_b32 s2, s[0:1], 0x2c
+; VARIANT4-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; VARIANT4-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
+; VARIANT4-NEXT:    s_waitcnt lgkmcnt(0)
+; VARIANT4-NEXT:    v_xad_u32 v1, v0, -1, s2
+; VARIANT4-NEXT:    global_store_b32 v3, v0, s[0:1]
+; VARIANT4-NEXT:    s_barrier_signal -1
+; VARIANT4-NEXT:    s_barrier_wait -1
+; VARIANT4-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; VARIANT4-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; VARIANT4-NEXT:    v_lshlrev_b64_e32 v[1:2], 2, v[1:2]
+; VARIANT4-NEXT:    v_add_co_u32 v1, vcc_lo, s0, v1
+; VARIANT4-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; VARIANT4-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, s1, v2, vcc_lo
+; VARIANT4-NEXT:    global_load_b32 v0, v[1:2], off
+; VARIANT4-NEXT:    s_waitcnt vmcnt(0)
+; VARIANT4-NEXT:    global_store_b32 v3, v0, s[0:1]
+; VARIANT4-NEXT:    s_nop 0
+; VARIANT4-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; VARIANT4-NEXT:    s_endpgm
+;
+; VARIANT5-LABEL: test_barrier:
+; VARIANT5:       ; %bb.0: ; %entry
+; VARIANT5-NEXT:    s_clause 0x1
+; VARIANT5-NEXT:    s_load_b32 s2, s[0:1], 0x2c
+; VARIANT5-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; VARIANT5-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
+; VARIANT5-NEXT:    s_waitcnt lgkmcnt(0)
+; VARIANT5-NEXT:    v_xad_u32 v1, v0, -1, s2
+; VARIANT5-NEXT:    global_store_b32 v3, v0, s[0:1]
+; VARIANT5-NEXT:    s_barrier_signal -1
+; VARIANT5-NEXT:    s_barrier_wait -1
+; VARIANT5-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; VARIANT5-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; VARIANT5-NEXT:    v_lshlrev_b64_e32 v[1:2], 2, v[1:2]
+; VARIANT5-NEXT:    v_add_co_u32 v1, vcc_lo, s0, v1
+; VARIANT5-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; VARIANT5-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, s1, v2, vcc_lo
+; VARIANT5-NEXT:    global_load_b32 v0, v[1:2], off
+; VARIANT5-NEXT:    s_waitcnt vmcnt(0)
+; VARIANT5-NEXT:    global_store_b32 v3, v0, s[0:1]
+; VARIANT5-NEXT:    s_nop 0
+; VARIANT5-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; VARIANT5-NEXT:    s_endpgm
+;
+; VARIANT6-LABEL: test_barrier:
+; VARIANT6:       ; %bb.0: ; %entry
+; VARIANT6-NEXT:    s_clause 0x1
+; VARIANT6-NEXT:    s_load_b32 s2, s[0:1], 0x2c
+; VARIANT6-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; VARIANT6-NEXT:    v_lshlrev_b32_e32 v5, 2, v0
+; VARIANT6-NEXT:    s_waitcnt lgkmcnt(0)
+; VARIANT6-NEXT:    s_sub_co_i32 s2, s2, 1
+; VARIANT6-NEXT:    v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0
+; VARIANT6-NEXT:    v_sub_nc_u32_e32 v1, s2, v0
+; VARIANT6-NEXT:    global_store_b32 v5, v0, s[0:1]
+; VARIANT6-NEXT:    s_barrier_signal -1
+; VARIANT6-NEXT:    s_barrier_wait -1
+; VARIANT6-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; VARIANT6-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; VARIANT6-NEXT:    v_lshlrev_b64_e32 v[1:2], 2, v[1:2]
+; VARIANT6-NEXT:    v_add_co_u32 v1, vcc_lo, v3, v1
+; VARIANT6-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; VARIANT6-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v4, v2, vcc_lo
+; VARIANT6-NEXT:    global_load_b32 v0, v[1:2], off
+; VARIANT6-NEXT:    s_waitcnt vmcnt(0)
+; VARIANT6-NEXT:    global_store_b32 v5, v0, s[0:1]
+; VARIANT6-NEXT:    s_nop 0
+; VARIANT6-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; VARIANT6-NEXT:    s_endpgm
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
new file mode 100644
index 0000000000000..1ad3e58ce7fc3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
@@ -0,0 +1,1366 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GLOBAL-ISEL %s
+
+define amdgpu_kernel void @test1_s_barrier_signal(ptr addrspace(1) %out) #0 {
+; GCN-LABEL: test1_s_barrier_signal:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    v_mul_u32_u24_e32 v1, v0, v0
+; GCN-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GCN-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    global_store_b32 v3, v2, s[0:1]
+; GCN-NEXT:    s_barrier_signal -1
+; GCN-NEXT:    s_barrier_wait -1
+; GCN-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT:    s_endpgm
+;
+; GLOBAL-ISEL-LABEL: test1_s_barrier_signal:
+; GLOBAL-ISEL:       ; %bb.0: ; %entry
+; GLOBAL-ISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT:    v_mul_lo_u32 v1, v0, v0
+; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
+; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
+; GLOBAL-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT:    s_barrier_signal -1
+; GLOBAL-ISEL-NEXT:    s_barrier_wait -1
+; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT:    s_nop 0
+; GLOBAL-ISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GLOBAL-ISEL-NEXT:    s_endpgm
+entry:
+  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
+  store i32 0, ptr addrspace(1) %tmp1
+  call void @llvm.amdgcn.s.barrier.signal(i32 -1)
+  call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+  %tmp3 = mul i32 %tmp, %tmp
+  %tmp4 = sub i32 %tmp3, %tmp
+  store i32 %tmp4, ptr addrspace(1) %tmp1
+  ret void
+}
+
+define amdgpu_kernel void @test2_s_barrier_signal(ptr addrspace(1) %out) #0 {
+; GCN-LABEL: test2_s_barrier_signal:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    v_mul_u32_u24_e32 v1, v0, v0
+; GCN-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GCN-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    global_store_b32 v3, v2, s[0:1]
+; GCN-NEXT:    s_barrier_signal 1
+; GCN-NEXT:    s_barrier_wait 1
+; GCN-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT:    s_endpgm
+;
+; GLOBAL-ISEL-LABEL: test2_s_barrier_signal:
+; GLOBAL-ISEL:       ; %bb.0: ; %entry
+; GLOBAL-ISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT:    v_mul_lo_u32 v1, v0, v0
+; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
+; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
+; GLOBAL-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT:    s_barrier_signal 1
+; GLOBAL-ISEL-NEXT:    s_barrier_wait 1
+; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT:    s_nop 0
+; GLOBAL-ISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GLOBAL-ISEL-NEXT:    s_endpgm
+entry:
+  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
+  store i32 0, ptr addrspace(1) %tmp1
+  call void @llvm.amdgcn.s.barrier.signal(i32 1)
+  call void @llvm.amdgcn.s.barrier.wait(i16 1)
+  %tmp3 = mul i32 %tmp, %tmp
+  %tmp4 = sub i32 %tmp3, %tmp
+  store i32 %tmp4, ptr addrspace(1) %tmp1
+  ret void
+}
+
+define amdgpu_kernel void @test3_s_barrier_signal(ptr addrspace(1) %out) #0 {
+; GCN-LABEL: test3_s_barrier_signal:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    v_mul_u32_u24_e32 v1, v0, v0
+; GCN-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GCN-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    global_store_b32 v3, v2, s[0:1]
+; GCN-NEXT:    s_barrier_signal 0
+; GCN-NEXT:    s_barrier_wait 0
+; GCN-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT:    s_endpgm
+;
+; GLOBAL-ISEL-LABEL: test3_s_barrier_signal:
+; GLOBAL-ISEL:       ; %bb.0: ; %entry
+; GLOBAL-ISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT:    v_mul_lo_u32 v1, v0, v0
+; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
+; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
+; GLOBAL-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT:    s_barrier_signal 0
+; GLOBAL-ISEL-NEXT:    s_barrier_wait 0
+; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT:    s_nop 0
+; GLOBAL-ISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GLOBAL-ISEL-NEXT:    s_endpgm
+entry:
+  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
+  store i32 0, ptr addrspace(1) %tmp1
+  call void @llvm.amdgcn.s.barrier.signal(i32 0)
+  call void @llvm.amdgcn.s.barrier.wait(i16 0)
+  %tmp3 = mul i32 %tmp, %tmp
+  %tmp4 = sub i32 %tmp3, %tmp
+  store i32 %tmp4, ptr addrspace(1) %tmp1
+  ret void
+}
+
+define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0 {
+; GCN-LABEL: test1_s_barrier_signal_var:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    v_mul_u32_u24_e32 v2, v0, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
+; GCN-NEXT:    s_mov_b32 m0, 1
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GCN-NEXT:    v_sub_nc_u32_e32 v0, v2, v0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    global_store_b32 v3, v1, s[0:1]
+; GCN-NEXT:    s_barrier_signal m0
+; GCN-NEXT:    s_barrier_wait 1
+; GCN-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT:    s_endpgm
+;
+; GLOBAL-ISEL-LABEL: test1_s_barrier_signal_var:
+; GLOBAL-ISEL:       ; %bb.0: ; %entry
+; GLOBAL-ISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT:    v_mul_lo_u32 v1, v0, v0
+; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
+; GLOBAL-ISEL-NEXT:    s_mov_b32 m0, 1
+; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GLOBAL-ISEL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
+; GLOBAL-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT:    s_barrier_signal m0
+; GLOBAL-ISEL-NEXT:    s_barrier_wait 1
+; GLOBAL-ISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT:    s_nop 0
+; GLOBAL-ISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GLOBAL-ISEL-NEXT:    s_endpgm
+entry:
+  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
+  store i32 0, ptr addrspace(1) %tmp1
+  call void @llvm.amdgcn.s.barrier.signal.var(i32 1)
+  call void @llvm.amdgcn.s.barrier.wait(i16 1)
+  %tmp3 = mul i32 %tmp, %tmp
+  %tmp4 = sub i32 %tmp3, %tmp
+  store i32 %tmp4, ptr addrspace(1) %tmp1
+  ret void
+}
+
+define void @test2_s_barrier_signal_var(i32 %arg) {
+; GCN-LABEL: test2_s_barrier_signal_var:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_readfirstlane_b32 s0, v0
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    s_mov_b32 m0, s0
+; GCN-NEXT:    s_barrier_signal m0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GLOBAL-ISEL-LABEL: test2_s_barrier_signal_var:
+; GLOBAL-ISEL:       ; %bb.0:
+; GLOBAL-ISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    v_readfirstlane_b32 m0, v0
+; GLOBAL-ISEL-NEXT:    s_barrier_signal m0
+; GLOBAL-ISEL-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.amdgcn.s.barrier.signal.var(i32 %arg)
+  ret void
+}
+
+define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 {
+; GCN-LABEL: test1_s_barrier_signal_isfirst:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GCN-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT:    s_barrier_signal_isfirst -1
+; GCN-NEXT:    s_cselect_b32 s3, s3, s5
+; GCN-NEXT:    s_cselect_b32 s2, s2, s4
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_load_b32 v2, v1, s[0:1]
+; GCN-NEXT:    global_load_b32 v1, v1, s[2:3]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mul_lo_u32 v1, v1, v2
+; GCN-NEXT:    global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT:    s_endpgm
+;
+; GLOBAL-ISEL-LABEL: test1_s_barrier_signal_isfirst:
+; GLOBAL-ISEL:       ; %bb.0: ; %entry
+; GLOBAL-ISEL-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GLOBAL-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT:    global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT:    s_barrier_signal_isfirst -1
+; GLOBAL-ISEL-NEXT:    s_cselect_b32 s8, 1, 0
+; GLOBAL-ISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GLOBAL-ISEL-NEXT:    s_and_b32 s8, s8, 1
+; GLOBAL-ISEL-NEXT:    s_cmp_lg_u32 s8, 0
+; GLOBAL-ISEL-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GLOBAL-ISEL-NEXT:    s_clause 0x1
+; GLOBAL-ISEL-NEXT:    global_load_b32 v2, v1, s[0:1]
+; GLOBAL-ISEL-NEXT:    global_load_b32 v1, v1, s[2:3]
+; GLOBAL-ISEL-NEXT:    s_waitcnt vmcnt(0)
+; GLOBAL-ISEL-NEXT:    v_mul_lo_u32 v1, v1, v2
+; GLOBAL-ISEL-NEXT:    global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT:    s_nop 0
+; GLOBAL-ISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GLOBAL-ISEL-NEXT:    s_endpgm
+entry:
+  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
+  store i32 0, ptr addrspace(1) %tmp1
+  %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
+  %0 = load i32, ptr addrspace(1) %a, align 4
+  %b.c = select i1 %isfirst, ptr addrspace(1) %b, ptr addrspace(1) %c
+  %1 = load i32, ptr addrspace(1) %b.c, align 4
+  %mul1 = mul nsw i32 %1, %0
+  store i32 %mul1, ptr addrspace(1) %tmp1
+  ret void
+}
+
+define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 {
+; GCN-LABEL: test2_s_barrier_signal_isfirst:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GCN-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT:    s_barrier_signal_isfirst 1
+; GCN-NEXT:    s_cselect_b32 s3, s3, s5
+; GCN-NEXT:    s_cselect_b32 s2, s2, s4
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    global_load_b32 v2, v1, s[0:1]
+; GCN-NEXT:    global_load_b32 v1, v1, s[2:3]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mul_lo_u32 v1, v1, v2
+; GCN-NEXT:    global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT:    s_nop 0
GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GCN-NEXT: s_endpgm +; +; GLOBAL-ISEL-LABEL: test2_s_barrier_signal_isfirst: +; GLOBAL-ISEL: ; %bb.0: ; %entry +; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] +; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst 1 +; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GLOBAL-ISEL-NEXT: s_and_b32 s8, s8, 1 +; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s8, 0 +; GLOBAL-ISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] +; GLOBAL-ISEL-NEXT: s_clause 0x1 +; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[0:1] +; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[2:3] +; GLOBAL-ISEL-NEXT: s_waitcnt vmcnt(0) +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v1, v2 +; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] +; GLOBAL-ISEL-NEXT: s_nop 0 +; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GLOBAL-ISEL-NEXT: s_endpgm +entry: + %tmp = call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp + store i32 0, ptr addrspace(1) %tmp1 + %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 1) + %0 = load i32, ptr addrspace(1) %a, align 4 + %b.c = select i1 %isfirst, ptr addrspace(1) %b, ptr addrspace(1) %c + %1 = load i32, ptr addrspace(1) %b.c, align 4 + %mul1 = mul nsw i32 %1, %0 + store i32 %mul1, ptr addrspace(1) %tmp1 + ret void +} + +define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { +; GCN-LABEL: test3_s_barrier_signal_isfirst: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_store_b32 v0, v1, s[6:7] +; GCN-NEXT: s_barrier_signal_isfirst 1 +; GCN-NEXT: s_cselect_b32 s3, s3, s5 +; GCN-NEXT: s_cselect_b32 s2, s2, s4 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_load_b32 v2, v1, s[0:1] +; GCN-NEXT: global_load_b32 v1, v1, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_lo_u32 v1, v1, v2 +; GCN-NEXT: global_store_b32 v0, v1, s[6:7] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GCN-NEXT: s_endpgm +; +; GLOBAL-ISEL-LABEL: test3_s_barrier_signal_isfirst: +; GLOBAL-ISEL: ; %bb.0: ; %entry +; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] +; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst 1 +; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GLOBAL-ISEL-NEXT: s_and_b32 s8, s8, 1 +; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s8, 0 +; GLOBAL-ISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] +; GLOBAL-ISEL-NEXT: s_clause 0x1 +; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[0:1] +; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[2:3] +; GLOBAL-ISEL-NEXT: s_waitcnt vmcnt(0) +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v1, v2 +; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] +; GLOBAL-ISEL-NEXT: s_nop 0 +; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GLOBAL-ISEL-NEXT: s_endpgm +entry: + %tmp = call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = getelementptr i32, ptr 
addrspace(1) %out, i32 %tmp + store i32 0, ptr addrspace(1) %tmp1 + %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 1) + %0 = load i32, ptr addrspace(1) %a, align 4 + %b.c = select i1 %isfirst, ptr addrspace(1) %b, ptr addrspace(1) %c + %1 = load i32, ptr addrspace(1) %b.c, align 4 + %mul1 = mul nsw i32 %1, %0 + store i32 %mul1, ptr addrspace(1) %tmp1 + ret void +} + +define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { +; GCN-LABEL: test1_s_barrier_signal_isfirst_var: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GCN-NEXT: s_mov_b32 m0, 1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_store_b32 v0, v1, s[6:7] +; GCN-NEXT: s_barrier_signal_isfirst m0 +; GCN-NEXT: s_cselect_b32 s3, s3, s5 +; GCN-NEXT: s_cselect_b32 s2, s2, s4 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_load_b32 v2, v1, s[0:1] +; GCN-NEXT: global_load_b32 v1, v1, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_lo_u32 v1, v1, v2 +; GCN-NEXT: global_store_b32 v0, v1, s[6:7] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GCN-NEXT: s_endpgm +; +; GLOBAL-ISEL-LABEL: test1_s_barrier_signal_isfirst_var: +; GLOBAL-ISEL: ; %bb.0: ; %entry +; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_mov_b32 m0, 1 +; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] +; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst m0 +; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GLOBAL-ISEL-NEXT: s_and_b32 s8, s8, 1 +; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s8, 0 +; GLOBAL-ISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] +; GLOBAL-ISEL-NEXT: s_clause 0x1 +; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[0:1] +; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[2:3] +; GLOBAL-ISEL-NEXT: s_waitcnt vmcnt(0) +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v1, v2 +; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] +; GLOBAL-ISEL-NEXT: s_nop 0 +; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GLOBAL-ISEL-NEXT: s_endpgm +entry: + %tmp = call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp + store i32 0, ptr addrspace(1) %tmp1 + %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst.var(i32 1) + %0 = load i32, ptr addrspace(1) %a, align 4 + %b.c = select i1 %isfirst, ptr addrspace(1) %b, ptr addrspace(1) %c + %1 = load i32, ptr addrspace(1) %b.c, align 4 + %mul1 = mul nsw i32 %1, %0 + store i32 %mul1, ptr addrspace(1) %tmp1 + ret void +} + +define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, i32 %arg, ptr addrspace(1) %out) { +; GCN-LABEL: test2_s_barrier_signal_isfirst_var: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_and_b32 v9, 0x3ff, v31 +; GCN-NEXT: v_readfirstlane_b32 s0, v6 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GCN-NEXT: v_lshlrev_b32_e32 v9, 2, v9 +; GCN-NEXT: s_mov_b32 m0, s0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_add_co_u32 v7, vcc_lo, v7, v9 +; GCN-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v8, vcc_lo +; GCN-NEXT: global_store_b32 v[7:8], v10, off +; 
GCN-NEXT: s_barrier_signal_isfirst m0 +; GCN-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GCN-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v3, v5, v3 +; GCN-NEXT: global_load_b32 v0, v[0:1], off +; GCN-NEXT: global_load_b32 v1, v[2:3], off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_lo_u32 v0, v1, v0 +; GCN-NEXT: global_store_b32 v[7:8], v0, off +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GLOBAL-ISEL-LABEL: test2_s_barrier_signal_isfirst_var: +; GLOBAL-ISEL: ; %bb.0: +; GLOBAL-ISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GLOBAL-ISEL-NEXT: v_and_b32_e32 v9, 0x3ff, v31 +; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 m0, v6 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v9, 2, v9 +; GLOBAL-ISEL-NEXT: v_add_co_u32 v7, vcc_lo, v7, v9 +; GLOBAL-ISEL-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v8, vcc_lo +; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v9, 0 +; GLOBAL-ISEL-NEXT: global_store_b32 v[7:8], v9, off +; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst m0 +; GLOBAL-ISEL-NEXT: s_cselect_b32 s0, 1, 0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GLOBAL-ISEL-NEXT: s_and_b32 s0, 1, s0 +; GLOBAL-ISEL-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GLOBAL-ISEL-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v3, v5, v3 +; GLOBAL-ISEL-NEXT: global_load_b32 v0, v[0:1], off +; GLOBAL-ISEL-NEXT: global_load_b32 v1, v[2:3], off +; GLOBAL-ISEL-NEXT: s_waitcnt vmcnt(0) +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v0, v1, v0 +; GLOBAL-ISEL-NEXT: global_store_b32 v[7:8], v0, off +; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31] + %tmp = call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp + store i32 0, ptr addrspace(1) %tmp1 + %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst.var(i32 %arg) + %1 = load i32, ptr addrspace(1) %a, align 4 + %b.c = select i1 %isfirst, ptr addrspace(1) %b, ptr addrspace(1) %c + %2 = load i32, ptr addrspace(1) %b.c, align 4 + %mul1 = mul nsw i32 %2, %1 + store i32 %mul1, ptr addrspace(1) %tmp1 + ret void +} + +define amdgpu_kernel void @test1_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 { +; GCN-LABEL: test1_s_barrier_init: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshl_b32 s2, s2, 16 +; GCN-NEXT: global_store_b32 v3, v2, s[0:1] +; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: s_barrier_init -1 +; GCN-NEXT: global_store_b32 v3, v0, s[0:1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GCN-NEXT: s_endpgm +; +; GLOBAL-ISEL-LABEL: test1_s_barrier_init: +; GLOBAL-ISEL: ; %bb.0: ; %entry +; GLOBAL-ISEL-NEXT: s_clause 0x1 +; GLOBAL-ISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_lshl_b32 m0, 16, s2 +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; 
GLOBAL-ISEL-NEXT: s_barrier_init -1 +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GLOBAL-ISEL-NEXT: s_nop 0 +; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GLOBAL-ISEL-NEXT: s_endpgm +entry: + %tmp = call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp + store i32 0, ptr addrspace(1) %tmp1 + call void @llvm.amdgcn.s.barrier.init(i32 -1, i32 %mbrCnt) + %tmp2 = mul i32 %tmp, %tmp + %tmp3 = sub i32 %tmp2, %tmp + store i32 %tmp3, ptr addrspace(1) %tmp1 + ret void +} + +define amdgpu_kernel void @test2_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 { +; GCN-LABEL: test2_s_barrier_init: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshl_b32 s2, s2, 16 +; GCN-NEXT: global_store_b32 v3, v2, s[0:1] +; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: s_barrier_init 1 +; GCN-NEXT: global_store_b32 v3, v0, s[0:1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GCN-NEXT: s_endpgm +; +; GLOBAL-ISEL-LABEL: test2_s_barrier_init: +; GLOBAL-ISEL: ; %bb.0: ; %entry +; GLOBAL-ISEL-NEXT: s_clause 0x1 +; GLOBAL-ISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_lshl_b32 m0, 16, s2 +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GLOBAL-ISEL-NEXT: s_barrier_init 1 +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GLOBAL-ISEL-NEXT: s_nop 0 +; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GLOBAL-ISEL-NEXT: s_endpgm +entry: + %tmp = call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp + store i32 0, ptr addrspace(1) %tmp1 + call void @llvm.amdgcn.s.barrier.init(i32 1, i32 %mbrCnt) + %tmp3 = mul i32 %tmp, %tmp + %tmp4 = sub i32 %tmp3, %tmp + store i32 %tmp4, ptr addrspace(1) %tmp1 + ret void +} + +define amdgpu_kernel void @test3_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 { +; GCN-LABEL: test3_s_barrier_init: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshl_b32 s2, s2, 16 +; GCN-NEXT: global_store_b32 v3, v2, s[0:1] +; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: s_barrier_init 0 +; GCN-NEXT: global_store_b32 v3, v0, s[0:1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GCN-NEXT: s_endpgm +; +; GLOBAL-ISEL-LABEL: test3_s_barrier_init: +; GLOBAL-ISEL: ; %bb.0: ; %entry +; GLOBAL-ISEL-NEXT: s_clause 0x1 +; GLOBAL-ISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: 
s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_lshl_b32 m0, 16, s2 +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GLOBAL-ISEL-NEXT: s_barrier_init 0 +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GLOBAL-ISEL-NEXT: s_nop 0 +; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GLOBAL-ISEL-NEXT: s_endpgm +entry: + %tmp = call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp + store i32 0, ptr addrspace(1) %tmp1 + call void @llvm.amdgcn.s.barrier.init(i32 0, i32 %mbrCnt) + %tmp3 = mul i32 %tmp, %tmp + %tmp4 = sub i32 %tmp3, %tmp + store i32 %tmp4, ptr addrspace(1) %tmp1 + ret void +} + +define amdgpu_kernel void @test4_s_barrier_init(ptr addrspace(1) %out, i32 %bar, i32 %mbrCnt) #0 { +; GCN-LABEL: test4_s_barrier_init: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshl_b32 s3, s3, 16 +; GCN-NEXT: global_store_b32 v3, v2, s[0:1] +; GCN-NEXT: s_or_b32 s2, s2, s3 +; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: s_barrier_init m0 +; GCN-NEXT: global_store_b32 v3, v0, s[0:1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GCN-NEXT: s_endpgm +; +; GLOBAL-ISEL-LABEL: test4_s_barrier_init: +; GLOBAL-ISEL: ; %bb.0: ; %entry +; GLOBAL-ISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_lshl_b32 s3, 16, s3 +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GLOBAL-ISEL-NEXT: s_or_b32 m0, s2, s3 +; GLOBAL-ISEL-NEXT: s_barrier_init m0 +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GLOBAL-ISEL-NEXT: s_nop 0 +; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GLOBAL-ISEL-NEXT: s_endpgm +entry: + %tmp = call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp + store i32 0, ptr addrspace(1) %tmp1 + call void @llvm.amdgcn.s.barrier.init(i32 %bar, i32 %mbrCnt) + %tmp3 = mul i32 %tmp, %tmp + %tmp4 = sub i32 %tmp3, %tmp + store i32 %tmp4, ptr addrspace(1) %tmp1 + ret void +} + +define void @test5_s_barrier_init_m0(i32 %arg1, i32 %arg2) { +; GCN-LABEL: test5_s_barrier_init_m0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: s_mov_b32 m0, s0 +; GCN-NEXT: s_barrier_init m0 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GLOBAL-ISEL-LABEL: test5_s_barrier_init_m0: +; GLOBAL-ISEL: ; %bb.0: +; GLOBAL-ISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 s0, v1 +; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: s_lshl_b32 s0, 16, s0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; 
GLOBAL-ISEL-NEXT: s_or_b32 m0, s1, s0 +; GLOBAL-ISEL-NEXT: s_barrier_init m0 +; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31] + call void @llvm.amdgcn.s.barrier.init(i32 %arg1, i32 %arg2) + ret void +} + +define amdgpu_kernel void @test1_s_barrier_join(ptr addrspace(1) %out) #0 { +; GCN-LABEL: test1_s_barrier_join: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GCN-NEXT: s_barrier_join -1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_store_b32 v2, v0, s[0:1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GCN-NEXT: s_endpgm +; +; GLOBAL-ISEL-LABEL: test1_s_barrier_join: +; GLOBAL-ISEL: ; %bb.0: ; %entry +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GLOBAL-ISEL-NEXT: s_barrier_join -1 +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GLOBAL-ISEL-NEXT: s_nop 0 +; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GLOBAL-ISEL-NEXT: s_endpgm +entry: + %tmp = call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp + store i32 0, ptr addrspace(1) %tmp1 + call void @llvm.amdgcn.s.barrier.join(i32 -1) + %tmp3 = mul i32 %tmp, %tmp + %tmp4 = sub i32 %tmp3, %tmp + store i32 %tmp4, ptr addrspace(1) %tmp1 + ret void +} + +define amdgpu_kernel void @test2_s_barrier_join(ptr addrspace(1) %out) #0 { +; GCN-LABEL: test2_s_barrier_join: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GCN-NEXT: s_barrier_join 1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_store_b32 v2, v0, s[0:1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GCN-NEXT: s_endpgm +; +; GLOBAL-ISEL-LABEL: test2_s_barrier_join: +; GLOBAL-ISEL: ; %bb.0: ; %entry +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GLOBAL-ISEL-NEXT: s_barrier_join 1 +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GLOBAL-ISEL-NEXT: s_nop 0 +; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GLOBAL-ISEL-NEXT: s_endpgm +entry: + %tmp = call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp + store i32 0, ptr addrspace(1) %tmp1 + call void @llvm.amdgcn.s.barrier.join(i32 1) + %tmp3 = mul i32 %tmp, %tmp + %tmp4 = sub i32 %tmp3, %tmp + store i32 %tmp4, ptr addrspace(1) %tmp1 + ret void +} + +define amdgpu_kernel void @test3_s_barrier_join(ptr addrspace(1) %out) #0 { +; GCN-LABEL: test3_s_barrier_join: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GCN-NEXT: 
s_barrier_join 0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_store_b32 v2, v0, s[0:1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GCN-NEXT: s_endpgm +; +; GLOBAL-ISEL-LABEL: test3_s_barrier_join: +; GLOBAL-ISEL: ; %bb.0: ; %entry +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GLOBAL-ISEL-NEXT: s_barrier_join 0 +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GLOBAL-ISEL-NEXT: s_nop 0 +; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GLOBAL-ISEL-NEXT: s_endpgm +entry: + %tmp = call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp + store i32 0, ptr addrspace(1) %tmp1 + call void @llvm.amdgcn.s.barrier.join(i32 0) + %tmp3 = mul i32 %tmp, %tmp + %tmp4 = sub i32 %tmp3, %tmp + store i32 %tmp4, ptr addrspace(1) %tmp1 + ret void +} + +define amdgpu_kernel void @test4_s_barrier_join_m0(ptr addrspace(1) %out, i32 %bar) #0 { +; GCN-LABEL: test4_s_barrier_join_m0: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_b32 s0, s[0:1], 0x2c +; GCN-NEXT: v_mul_u32_u24_e32 v2, v0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_store_b32 v3, v1, s[2:3] +; GCN-NEXT: s_mov_b32 m0, s0 +; GCN-NEXT: s_barrier_join m0 +; GCN-NEXT: global_store_b32 v3, v0, s[2:3] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GCN-NEXT: s_endpgm +; +; GLOBAL-ISEL-LABEL: test4_s_barrier_join_m0: +; GLOBAL-ISEL: ; %bb.0: ; %entry +; GLOBAL-ISEL-NEXT: s_clause 0x1 +; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: s_load_b32 s0, s[0:1], 0x2c +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3] +; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s0 +; GLOBAL-ISEL-NEXT: s_barrier_join m0 +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3] +; GLOBAL-ISEL-NEXT: s_nop 0 +; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GLOBAL-ISEL-NEXT: s_endpgm +entry: + %tmp = call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp + store i32 0, ptr addrspace(1) %tmp1 + call void @llvm.amdgcn.s.barrier.join(i32 %bar) + %tmp3 = mul i32 %tmp, %tmp + %tmp4 = sub i32 %tmp3, %tmp + store i32 %tmp4, ptr addrspace(1) %tmp1 + ret void +} + +define void @test5_s_barrier_join_m0(i32 %arg) { +; GCN-LABEL: test5_s_barrier_join_m0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: s_mov_b32 m0, s0 +; GCN-NEXT: s_barrier_join m0 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GLOBAL-ISEL-LABEL: test5_s_barrier_join_m0: +; GLOBAL-ISEL: ; %bb.0: +; GLOBAL-ISEL-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 m0, v0 +; GLOBAL-ISEL-NEXT: s_barrier_join m0 +; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31] + call void @llvm.amdgcn.s.barrier.join(i32 %arg) + ret void +} + +define amdgpu_kernel void @test1_s_barrier_leave(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { +; GCN-LABEL: test1_s_barrier_leave: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_store_b32 v0, v1, s[6:7] +; GCN-NEXT: s_barrier_leave +; GCN-NEXT: s_cselect_b32 s3, s3, s5 +; GCN-NEXT: s_cselect_b32 s2, s2, s4 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_load_b32 v2, v1, s[0:1] +; GCN-NEXT: global_load_b32 v1, v1, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_lo_u32 v1, v1, v2 +; GCN-NEXT: global_store_b32 v0, v1, s[6:7] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GCN-NEXT: s_endpgm +; +; GLOBAL-ISEL-LABEL: test1_s_barrier_leave: +; GLOBAL-ISEL: ; %bb.0: ; %entry +; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] +; GLOBAL-ISEL-NEXT: s_barrier_leave +; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GLOBAL-ISEL-NEXT: s_and_b32 s8, s8, 1 +; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s8, 0 +; GLOBAL-ISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] +; GLOBAL-ISEL-NEXT: s_clause 0x1 +; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[0:1] +; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[2:3] +; GLOBAL-ISEL-NEXT: s_waitcnt vmcnt(0) +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v1, v2 +; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] +; GLOBAL-ISEL-NEXT: s_nop 0 +; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GLOBAL-ISEL-NEXT: s_endpgm +entry: + %tmp = call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp + store i32 0, ptr addrspace(1) %tmp1 + %0 = call i1 @llvm.amdgcn.s.barrier.leave() + %1 = load i32, ptr addrspace(1) %a, align 4 + %b.c = select i1 %0, ptr addrspace(1) %b, ptr addrspace(1) %c + %2 = load i32, ptr addrspace(1) %b.c, align 4 + %mul1 = mul nsw i32 %2, %1 + store i32 %mul1, ptr addrspace(1) %tmp1 + ret void +} + +define amdgpu_kernel void @test1_s_wakeup_barrier(ptr addrspace(1) %out) #0 { +; GCN-LABEL: test1_s_wakeup_barrier: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GCN-NEXT: s_wakeup_barrier -1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_store_b32 v2, v0, s[0:1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GCN-NEXT: s_endpgm +; +; GLOBAL-ISEL-LABEL: test1_s_wakeup_barrier: +; GLOBAL-ISEL: ; %bb.0: ; %entry +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GLOBAL-ISEL-NEXT: 
s_wakeup_barrier -1 +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GLOBAL-ISEL-NEXT: s_nop 0 +; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GLOBAL-ISEL-NEXT: s_endpgm +entry: + %tmp = call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp + store i32 0, ptr addrspace(1) %tmp1 + call void @llvm.amdgcn.s.wakeup.barrier(i32 -1) + %tmp3 = mul i32 %tmp, %tmp + %tmp4 = sub i32 %tmp3, %tmp + store i32 %tmp4, ptr addrspace(1) %tmp1 + ret void +} + +define amdgpu_kernel void @test2_s_wakeup_barrier(ptr addrspace(1) %out) #0 { +; GCN-LABEL: test2_s_wakeup_barrier: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GCN-NEXT: s_wakeup_barrier 1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_store_b32 v2, v0, s[0:1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GCN-NEXT: s_endpgm +; +; GLOBAL-ISEL-LABEL: test2_s_wakeup_barrier: +; GLOBAL-ISEL: ; %bb.0: ; %entry +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GLOBAL-ISEL-NEXT: s_wakeup_barrier 1 +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GLOBAL-ISEL-NEXT: s_nop 0 +; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GLOBAL-ISEL-NEXT: s_endpgm +entry: + %tmp = call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp + store i32 0, ptr addrspace(1) %tmp1 + call void @llvm.amdgcn.s.wakeup.barrier(i32 1) + %tmp3 = mul i32 %tmp, %tmp + %tmp4 = sub i32 %tmp3, %tmp + store i32 %tmp4, ptr addrspace(1) %tmp1 + ret void +} + +define amdgpu_kernel void @test3_s_wakeup_barrier(ptr addrspace(1) %out) #0 { +; GCN-LABEL: test3_s_wakeup_barrier: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GCN-NEXT: s_wakeup_barrier 0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_store_b32 v2, v0, s[0:1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GCN-NEXT: s_endpgm +; +; GLOBAL-ISEL-LABEL: test3_s_wakeup_barrier: +; GLOBAL-ISEL: ; %bb.0: ; %entry +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GLOBAL-ISEL-NEXT: s_wakeup_barrier 0 +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GLOBAL-ISEL-NEXT: s_nop 0 +; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GLOBAL-ISEL-NEXT: s_endpgm +entry: + %tmp = call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp + store i32 0, ptr addrspace(1) %tmp1 + call void @llvm.amdgcn.s.wakeup.barrier(i32 0) + %tmp3 = mul i32 %tmp, %tmp + %tmp4 = sub i32 %tmp3, %tmp + store i32 
%tmp4, ptr addrspace(1) %tmp1 + ret void +} + +define amdgpu_kernel void @test4_s_wakeup_barrier_m0(ptr addrspace(1) %out, i32 %bar) #0 { +; GCN-LABEL: test4_s_wakeup_barrier_m0: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_b32 s0, s[0:1], 0x2c +; GCN-NEXT: v_mul_u32_u24_e32 v2, v0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_store_b32 v3, v1, s[2:3] +; GCN-NEXT: s_mov_b32 m0, s0 +; GCN-NEXT: s_wakeup_barrier m0 +; GCN-NEXT: global_store_b32 v3, v0, s[2:3] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GCN-NEXT: s_endpgm +; +; GLOBAL-ISEL-LABEL: test4_s_wakeup_barrier_m0: +; GLOBAL-ISEL: ; %bb.0: ; %entry +; GLOBAL-ISEL-NEXT: s_clause 0x1 +; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: s_load_b32 s0, s[0:1], 0x2c +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3] +; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s0 +; GLOBAL-ISEL-NEXT: s_wakeup_barrier m0 +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3] +; GLOBAL-ISEL-NEXT: s_nop 0 +; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GLOBAL-ISEL-NEXT: s_endpgm +entry: + %tmp = call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp + store i32 0, ptr addrspace(1) %tmp1 + call void @llvm.amdgcn.s.wakeup.barrier(i32 %bar) + %tmp3 = mul i32 %tmp, %tmp + %tmp4 = sub i32 %tmp3, %tmp + store i32 %tmp4, ptr addrspace(1) %tmp1 + ret void +} + +define void @test5_s_wakeup_barrier_m0(i32 %arg) { +; GCN-LABEL: test5_s_wakeup_barrier_m0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: s_mov_b32 m0, s0 +; GCN-NEXT: s_wakeup_barrier m0 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GLOBAL-ISEL-LABEL: test5_s_wakeup_barrier_m0: +; GLOBAL-ISEL: ; %bb.0: +; GLOBAL-ISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 m0, v0 +; GLOBAL-ISEL-NEXT: s_wakeup_barrier m0 +; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31] + call void @llvm.amdgcn.s.wakeup.barrier(i32 %arg) + ret void +} + +define amdgpu_kernel void @test1_s_get_barrier_state(ptr addrspace(1) %out) #0 { +; GCN-LABEL: test1_s_get_barrier_state: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_get_barrier_state s2, -1 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_lshlrev_b32 v0, 2, v0 +; GCN-NEXT: global_store_b32 v0, v1, s[0:1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GCN-NEXT: s_endpgm +; +; GLOBAL-ISEL-LABEL: test1_s_get_barrier_state: +; GLOBAL-ISEL: ; %bb.0: ; %entry +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, -1 +; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_delay_alu 
instid0(SALU_CYCLE_2) +; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2 +; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GLOBAL-ISEL-NEXT: s_nop 0 +; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GLOBAL-ISEL-NEXT: s_endpgm +entry: + %tmp = call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp + store i32 0, ptr addrspace(1) %tmp1 + %state = call i32 @llvm.amdgcn.s.get.barrier.state(i32 -1) + store i32 %state, ptr addrspace(1) %tmp1 + ret void +} + +define amdgpu_kernel void @test2_s_get_barrier_state(ptr addrspace(1) %out) #0 { +; GCN-LABEL: test2_s_get_barrier_state: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_get_barrier_state s2, 1 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_lshlrev_b32 v0, 2, v0 +; GCN-NEXT: global_store_b32 v0, v1, s[0:1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GCN-NEXT: s_endpgm +; +; GLOBAL-ISEL-LABEL: test2_s_get_barrier_state: +; GLOBAL-ISEL: ; %bb.0: ; %entry +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, 1 +; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) +; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2 +; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GLOBAL-ISEL-NEXT: s_nop 0 +; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GLOBAL-ISEL-NEXT: s_endpgm +entry: + %tmp = call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp + store i32 0, ptr addrspace(1) %tmp1 + %state = call i32 @llvm.amdgcn.s.get.barrier.state(i32 1) + store i32 %state, ptr addrspace(1) %tmp1 + ret void +} + +define amdgpu_kernel void @test3_s_get_barrier_state(ptr addrspace(1) %out) #0 { +; GCN-LABEL: test3_s_get_barrier_state: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_get_barrier_state s2, 0 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_lshlrev_b32 v0, 2, v0 +; GCN-NEXT: global_store_b32 v0, v1, s[0:1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GCN-NEXT: s_endpgm +; +; GLOBAL-ISEL-LABEL: test3_s_get_barrier_state: +; GLOBAL-ISEL: ; %bb.0: ; %entry +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, 0 +; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) +; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2 +; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GLOBAL-ISEL-NEXT: s_nop 0 +; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GLOBAL-ISEL-NEXT: s_endpgm +entry: + %tmp = call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp + store i32 0, ptr addrspace(1) %tmp1 + %state = call i32 @llvm.amdgcn.s.get.barrier.state(i32 0) + store i32 %state, ptr addrspace(1) %tmp1 + ret void +} + +define amdgpu_kernel void @test4_s_get_barrier_state_m0(ptr addrspace(1) %out, i32 %bar) #0 { +; GCN-LABEL: 
test4_s_get_barrier_state_m0: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_b32 s0, s[0:1], 0x2c +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_store_b32 v0, v1, s[2:3] +; GCN-NEXT: s_mov_b32 m0, s0 +; GCN-NEXT: s_get_barrier_state s0, m0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_2) +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: global_store_b32 v0, v1, s[2:3] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GCN-NEXT: s_endpgm +; +; GLOBAL-ISEL-LABEL: test4_s_get_barrier_state_m0: +; GLOBAL-ISEL: ; %bb.0: ; %entry +; GLOBAL-ISEL-NEXT: s_clause 0x1 +; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: s_load_b32 s0, s[0:1], 0x2c +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[2:3] +; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s0 +; GLOBAL-ISEL-NEXT: s_get_barrier_state s0, m0 +; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) +; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s0 +; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[2:3] +; GLOBAL-ISEL-NEXT: s_nop 0 +; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GLOBAL-ISEL-NEXT: s_endpgm +entry: + %tmp = call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp + store i32 0, ptr addrspace(1) %tmp1 + %state = call i32 @llvm.amdgcn.s.get.barrier.state(i32 %bar) + store i32 %state, ptr addrspace(1) %tmp1 + ret void +} + +define i32 @test5_s_get_barrier_state_m0(i32 %arg) { +; GCN-LABEL: test5_s_get_barrier_state_m0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_2) +; GCN-NEXT: s_mov_b32 m0, s0 +; GCN-NEXT: s_get_barrier_state s0, m0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GLOBAL-ISEL-LABEL: test5_s_get_barrier_state_m0: +; GLOBAL-ISEL: ; %bb.0: +; GLOBAL-ISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 m0, v0 +; GLOBAL-ISEL-NEXT: s_get_barrier_state s0, m0 +; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) +; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v0, s0 +; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31] + %state = call i32 @llvm.amdgcn.s.get.barrier.state(i32 %arg) + ret i32 %state +} + +define amdgpu_kernel void @test_barrier_convert(ptr addrspace(1) %out) #0 { +; GCN-LABEL: test_barrier_convert: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_store_b32 v3, v2, s[0:1] +; GCN-NEXT: s_barrier_signal -1 +; GCN-NEXT: s_barrier_wait -1 +; GCN-NEXT: global_store_b32 v3, v0, s[0:1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GCN-NEXT: s_endpgm +; +; GLOBAL-ISEL-LABEL: test_barrier_convert: +; GLOBAL-ISEL: ; %bb.0: ; %entry +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: 
v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GLOBAL-ISEL-NEXT: s_barrier_signal -1 +; GLOBAL-ISEL-NEXT: s_barrier_wait -1 +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GLOBAL-ISEL-NEXT: s_nop 0 +; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GLOBAL-ISEL-NEXT: s_endpgm +entry: + %tmp = call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp + store i32 0, ptr addrspace(1) %tmp1 + call void @llvm.amdgcn.s.barrier() + %tmp3 = mul i32 %tmp, %tmp + %tmp4 = sub i32 %tmp3, %tmp + store i32 %tmp4, ptr addrspace(1) %tmp1 + ret void +} +declare void @llvm.amdgcn.s.barrier() #1 +declare void @llvm.amdgcn.s.barrier.wait(i16) #1 +declare void @llvm.amdgcn.s.barrier.signal(i32) #1 +declare void @llvm.amdgcn.s.barrier.signal.var(i32) #1 +declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1 +declare i1 @llvm.amdgcn.s.barrier.signal.isfirst.var(i32) #1 +declare void @llvm.amdgcn.s.barrier.init(i32, i32) #1 +declare void @llvm.amdgcn.s.barrier.join(i32) #1 +declare i1 @llvm.amdgcn.s.barrier.leave() #1 +declare void @llvm.amdgcn.s.wakeup.barrier(i32) #1 +declare i32 @llvm.amdgcn.s.get.barrier.state(i32) #1 +declare i32 @llvm.amdgcn.s.get.barrier.state.var(i32) #1 +declare i32 @llvm.amdgcn.workitem.id.x() #2 + +attributes #0 = { nounwind } +attributes #1 = { convergent nounwind } +attributes #2 = { nounwind readnone } diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s index db166e8ffc104..8f2944586ed29 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s @@ -684,6 +684,51 @@ s_rndne_f16 s5, 0xfe0b s_rndne_f16 s5, 0x3456 // GFX12: encoding: [0xff,0x6e,0x85,0xbe,0x56,0x34,0x00,0x00] +s_barrier_signal -2 +// GFX12: encoding: [0xc2,0x4e,0x80,0xbe] + +s_barrier_signal -1 +// GFX12: encoding: [0xc1,0x4e,0x80,0xbe] + +s_barrier_signal m0 +// GFX12: encoding: [0x7d,0x4e,0x80,0xbe] + +s_barrier_signal_isfirst -2 +// GFX12: encoding: [0xc2,0x4f,0x80,0xbe] + +s_barrier_signal_isfirst -1 +// GFX12: encoding: [0xc1,0x4f,0x80,0xbe] + +s_barrier_signal_isfirst m0 +// GFX12: encoding: [0x7d,0x4f,0x80,0xbe] + +s_barrier_init -1 +// GFX12: encoding: [0xc1,0x51,0x80,0xbe] + +s_barrier_init -2 +// GFX12: encoding: [0xc2,0x51,0x80,0xbe] + +s_barrier_init m0 +// GFX12: encoding: [0x7d,0x51,0x80,0xbe] + +s_barrier_join -1 +// GFX12: encoding: [0xc1,0x52,0x80,0xbe] + +s_barrier_join -2 +// GFX12: encoding: [0xc2,0x52,0x80,0xbe] + +s_barrier_join m0 +// GFX12: encoding: [0x7d,0x52,0x80,0xbe] + +s_wakeup_barrier 1 +// GFX12: encoding: [0x81,0x57,0x80,0xbe] + +s_wakeup_barrier -1 +// GFX12: encoding: [0xc1,0x57,0x80,0xbe] + +s_wakeup_barrier m0 +// GFX12: encoding: [0x7d,0x57,0x80,0xbe] + s_mov_b32 s0, s1 // GFX12: encoding: [0x01,0x00,0x80,0xbe] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s b/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s index 2e9df11d6f5a4..cf78b87a47618 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s @@ -33,6 +33,15 @@ s_singleuse_vdst 0xffff s_singleuse_vdst 0x1234 // GFX12: encoding: [0x34,0x12,0x93,0xbf] +s_barrier_wait 0xffff +// GFX12: encoding: [0xff,0xff,0x94,0xbf] + +s_barrier_wait 1 +// GFX12: encoding: [0x01,0x00,0x94,0xbf] + +s_barrier_leave +// GFX12: encoding: [0x00,0x00,0x95,0xbf] + 
//===----------------------------------------------------------------------===// // s_waitcnt //===----------------------------------------------------------------------===// diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt index c061462339b66..1c31ee1e5dd7f 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt @@ -684,6 +684,60 @@ # GFX12: s_rndne_f16 s5, 0x3456 ; encoding: [0xff,0x6e,0x85,0xbe,0x56,0x34,0x00,0x00] 0xff,0x6e,0x85,0xbe,0x56,0x34,0x00,0x00 +# GFX12: s_barrier_signal -2 ; encoding: [0xc2,0x4e,0x80,0xbe] +0xc2,0x4e,0x80,0xbe + +# GFX12: s_barrier_signal -1 ; encoding: [0xc1,0x4e,0x80,0xbe] +0xc1,0x4e,0x80,0xbe + +# GFX12: s_barrier_signal m0 ; encoding: [0x7d,0x4e,0x80,0xbe] +0x7d,0x4e,0x80,0xbe + +# GFX12: s_barrier_signal_isfirst -2 ; encoding: [0xc2,0x4f,0x80,0xbe] +0xc2,0x4f,0x80,0xbe + +# GFX12: s_barrier_signal_isfirst -1 ; encoding: [0xc1,0x4f,0x80,0xbe] +0xc1,0x4f,0x80,0xbe + +# GFX12: s_barrier_signal_isfirst m0 ; encoding: [0x7d,0x4f,0x80,0xbe] +0x7d,0x4f,0x80,0xbe + +# GFX12: s_barrier_init -1 ; encoding: [0xc1,0x51,0x80,0xbe] +0xc1,0x51,0x80,0xbe + +# GFX12: s_barrier_init -2 ; encoding: [0xc2,0x51,0x80,0xbe] +0xc2,0x51,0x80,0xbe + +# GFX12: s_barrier_init m0 ; encoding: [0x7d,0x51,0x80,0xbe] +0x7d,0x51,0x80,0xbe + +# GFX12: s_barrier_join -1 ; encoding: [0xc1,0x52,0x80,0xbe] +0xc1,0x52,0x80,0xbe + +# GFX12: s_barrier_join -2 ; encoding: [0xc2,0x52,0x80,0xbe] +0xc2,0x52,0x80,0xbe + +# GFX12: s_barrier_join m0 ; encoding: [0x7d,0x52,0x80,0xbe] +0x7d,0x52,0x80,0xbe + +# GFX12: s_wakeup_barrier 1 ; encoding: [0x81,0x57,0x80,0xbe] +0x81,0x57,0x80,0xbe + +# GFX12: s_wakeup_barrier -1 ; encoding: [0xc1,0x57,0x80,0xbe] +0xc1,0x57,0x80,0xbe + +# GFX12: s_wakeup_barrier m0 ; encoding: [0x7d,0x57,0x80,0xbe] +0x7d,0x57,0x80,0xbe + +# GFX12: s_get_barrier_state s3, -1 ; encoding: [0xc1,0x50,0x83,0xbe] +0xc1,0x50,0x83,0xbe + +# GFX12: s_get_barrier_state s3, -2 ; encoding: [0xc2,0x50,0x83,0xbe] +0xc2,0x50,0x83,0xbe + +# GFX12: s_get_barrier_state s3, m0 ; encoding: [0x7d,0x50,0x83,0xbe] +0x7d,0x50,0x83,0xbe + # GFX12: s_abs_i32 exec_hi, s1 ; encoding: [0x01,0x15,0xff,0xbe] 0x01,0x15,0xff,0xbe diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt index fe74ff08a8e5c..13ded15998fb2 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt @@ -15,6 +15,18 @@ # GFX12: s_singleuse_vdst 0x1234 ; encoding: [0x34,0x12,0x93,0xbf] 0x34,0x12,0x93,0xbf +# GFX12: s_barrier_wait 0xffff ; encoding: [0xff,0xff,0x94,0xbf] +0xff,0xff,0x94,0xbf + +# GFX12: s_barrier_wait 1 ; encoding: [0x01,0x00,0x94,0xbf] +0x01,0x00,0x94,0xbf + +# GFX12: s_barrier_leave ; encoding: [0x00,0x00,0x95,0xbf] +0x00,0x00,0x95,0xbf + +# GFX12: s_barrier ; encoding: [0x00,0x00,0xbd,0xbf] +0x00,0x00,0xbd,0xbf + # GFX12: s_branch 0 ; encoding: [0x00,0x00,0xa0,0xbf] 0x00,0x00,0xa0,0xbf