Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
Original file line number Diff line number Diff line change
Expand Up @@ -749,6 +749,7 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8bf16, "V8yV8y*3", "nc", "gfx

TARGET_BUILTIN(__builtin_amdgcn_s_setprio_inc_wg, "vIs", "n", "setprio-inc-wg-inst")
TARGET_BUILTIN(__builtin_amdgcn_s_monitor_sleep, "vIs", "n", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vv*", "n", "s-wakeup-barrier-inst")

TARGET_BUILTIN(__builtin_amdgcn_s_wait_asynccnt, "vIUs", "n", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_s_wait_tensorcnt, "vIUs", "n", "gfx1250-insts")
Expand Down
4 changes: 2 additions & 2 deletions clang/test/CodeGenOpenCL/amdgpu-cluster-dims.cl
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ kernel void foo(global int *p) { *p = 1; }
// CHECK-NEXT: ret void
//
//.
// CHECK: attributes #[[ATTR0]] = { convergent norecurse nounwind "amdgpu-cluster-dims"="0,0,0" "amdgpu-flat-work-group-size"="1,256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1250" "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+sad-insts,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" "uniform-work-group-size"="false" }
// CHECK: attributes #[[ATTR1]] = { alwaysinline convergent norecurse nounwind "amdgpu-cluster-dims"="0,0,0" "amdgpu-flat-work-group-size"="1,256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1250" "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+sad-insts,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" }
// CHECK: attributes #[[ATTR0]] = { convergent norecurse nounwind "amdgpu-cluster-dims"="0,0,0" "amdgpu-flat-work-group-size"="1,256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1250" "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" "uniform-work-group-size"="false" }
// CHECK: attributes #[[ATTR1]] = { alwaysinline convergent norecurse nounwind "amdgpu-cluster-dims"="0,0,0" "amdgpu-flat-work-group-size"="1,256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1250" "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" }
// CHECK: attributes #[[ATTR2]] = { convergent nounwind }
//.
// CHECK: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
Expand Down
4 changes: 2 additions & 2 deletions clang/test/CodeGenOpenCL/amdgpu-features.cl
Original file line number Diff line number Diff line change
Expand Up @@ -109,8 +109,8 @@
// GFX1153: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+sad-insts,+wavefrontsize32"
// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-global-pk-add-bf16-inst,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+sad-insts,+wavefrontsize32"
// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-global-pk-add-bf16-inst,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+sad-insts,+wavefrontsize32"
// GFX1250: "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+sad-insts,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32"
// GFX1251: "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+sad-insts,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32"
// GFX1250: "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32"
// GFX1251: "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32"

// GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+lerp-inst,+qsad-insts,+sad-insts,+wavefrontsize64"

Expand Down
15 changes: 15 additions & 0 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
Original file line number Diff line number Diff line change
Expand Up @@ -1489,6 +1489,21 @@ void test_s_cluster_barrier()
__builtin_amdgcn_s_cluster_barrier();
}

// CHECK-LABEL: @test_s_wakeup_barrier(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[BAR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr
// CHECK-NEXT: store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
// CHECK-NEXT: call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) [[TMP1]])
// CHECK-NEXT: ret void
//
void test_s_wakeup_barrier(void *bar)
{
__builtin_amdgcn_s_wakeup_barrier(bar);
}

// CHECK-LABEL: @test_global_add_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)
Expand Down
6 changes: 6 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,12 @@ def int_amdgcn_s_barrier_join : ClangBuiltin<"__builtin_amdgcn_s_barrier_join">,
Intrinsic<[], [local_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
IntrNoCallback, IntrNoFree]>;

// void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) %barrier)
// The %barrier argument must be uniform, otherwise behavior is undefined.
def int_amdgcn_s_wakeup_barrier : ClangBuiltin<"__builtin_amdgcn_s_wakeup_barrier">,
Intrinsic<[], [local_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
IntrNoCallback, IntrNoFree]>;

// void @llvm.amdgcn.s.barrier.wait(i16 %barrierType)
def int_amdgcn_s_barrier_wait : ClangBuiltin<"__builtin_amdgcn_s_barrier_wait">,
Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -1276,6 +1276,12 @@ def FeatureSetPrioIncWgInst : SubtargetFeature<"setprio-inc-wg-inst",
"Has s_setprio_inc_wg instruction."
>;

def FeatureSWakeupBarrier : SubtargetFeature<"s-wakeup-barrier-inst",
"HasSWakeupBarrier",
"true",
"Has s_wakeup_barrier instruction."
>;

//===------------------------------------------------------------===//
// Subtarget Features (options and debugging)
//===------------------------------------------------------------===//
Expand Down Expand Up @@ -2200,6 +2206,7 @@ def FeatureISAVersion12_50_Common : FeatureSet<
FeaturePkAddMinMaxInsts,
FeatureLdsBarrierArriveAtomic,
FeatureSetPrioIncWgInst,
FeatureSWakeupBarrier,
Feature45BitNumRecordsBufferResource,
FeatureSupportsXNACK,
FeatureXNACK,
Expand Down Expand Up @@ -3076,6 +3083,9 @@ def HasLdsBarrierArriveAtomic : Predicate<"Subtarget->hasLdsBarrierArriveAtomic(
def HasSetPrioIncWgInst : Predicate<"Subtarget->hasSetPrioIncWgInst()">,
AssemblerPredicate<(all_of FeatureSetPrioIncWgInst)>;

def HasSWakeupBarrier : Predicate<"Subtarget->hasSWakeupBarrier()">,
AssemblerPredicate<(all_of FeatureSWakeupBarrier)>;

def NeedsAlignedVGPRs : Predicate<"Subtarget->needsAlignedVGPRs()">,
AssemblerPredicate<(all_of FeatureRequiresAlignedVGPRs)>;

Expand Down
14 changes: 14 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2392,6 +2392,16 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
case Intrinsic::amdgcn_s_barrier_init:
case Intrinsic::amdgcn_s_barrier_signal_var:
return selectNamedBarrierInit(I, IntrinsicID);
case Intrinsic::amdgcn_s_wakeup_barrier: {
if (!STI.hasSWakeupBarrier()) {
Function &F = I.getMF()->getFunction();
F.getContext().diagnose(
DiagnosticInfoUnsupported(F, "intrinsic not supported on subtarget",
I.getDebugLoc(), DS_Error));
return false;
}
return selectNamedBarrierInst(I, IntrinsicID);
}
case Intrinsic::amdgcn_s_barrier_join:
case Intrinsic::amdgcn_s_get_named_barrier_state:
return selectNamedBarrierInst(I, IntrinsicID);
Expand Down Expand Up @@ -6832,6 +6842,8 @@ unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
llvm_unreachable("not a named barrier op");
case Intrinsic::amdgcn_s_barrier_join:
return AMDGPU::S_BARRIER_JOIN_IMM;
case Intrinsic::amdgcn_s_wakeup_barrier:
return AMDGPU::S_WAKEUP_BARRIER_IMM;
case Intrinsic::amdgcn_s_get_named_barrier_state:
return AMDGPU::S_GET_BARRIER_STATE_IMM;
};
Expand All @@ -6841,6 +6853,8 @@ unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
llvm_unreachable("not a named barrier op");
case Intrinsic::amdgcn_s_barrier_join:
return AMDGPU::S_BARRIER_JOIN_M0;
case Intrinsic::amdgcn_s_wakeup_barrier:
return AMDGPU::S_WAKEUP_BARRIER_M0;
case Intrinsic::amdgcn_s_get_named_barrier_state:
return AMDGPU::S_GET_BARRIER_STATE_M0;
};
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,7 @@ bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) {
case Intrinsic::amdgcn_s_barrier_wait:
case Intrinsic::amdgcn_s_barrier_leave:
case Intrinsic::amdgcn_s_get_barrier_state:
case Intrinsic::amdgcn_s_wakeup_barrier:
case Intrinsic::amdgcn_wave_barrier:
case Intrinsic::amdgcn_sched_barrier:
case Intrinsic::amdgcn_sched_group_barrier:
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3347,6 +3347,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
constrainOpWithReadfirstlane(B, MI, 1);
return;
case Intrinsic::amdgcn_s_barrier_join:
case Intrinsic::amdgcn_s_wakeup_barrier:
constrainOpWithReadfirstlane(B, MI, 1);
return;
case Intrinsic::amdgcn_s_barrier_init:
Expand Down Expand Up @@ -5583,6 +5584,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
break;
case Intrinsic::amdgcn_s_barrier_join:
case Intrinsic::amdgcn_s_wakeup_barrier:
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
break;
case Intrinsic::amdgcn_s_barrier_init:
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool HasPointSampleAccel = false;
bool HasLdsBarrierArriveAtomic = false;
bool HasSetPrioIncWgInst = false;
bool HasSWakeupBarrier = false;

bool RequiresCOV6 = false;
bool UseBlockVGPROpsForCSR = false;
Expand Down Expand Up @@ -1612,6 +1613,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
// \returns true if target has S_SETPRIO_INC_WG instruction.
bool hasSetPrioIncWgInst() const { return HasSetPrioIncWgInst; }

// \returns true if target has S_WAKEUP_BARRIER instruction.
bool hasSWakeupBarrier() const { return HasSWakeupBarrier; }

// \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead
// of sign-extending. Note that GFX1250 has not only fixed the bug but also
// extended VA to 57 bits.
Expand Down
29 changes: 25 additions & 4 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11527,6 +11527,11 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
return SDValue(NewMI, 0);
}
case Intrinsic::amdgcn_s_wakeup_barrier: {
if (!Subtarget->hasSWakeupBarrier())
return SDValue();
[[fallthrough]];
}
case Intrinsic::amdgcn_s_barrier_join: {
// these three intrinsics have one operand: barrier pointer
SDValue Chain = Op->getOperand(0);
Expand All @@ -11536,16 +11541,32 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,

if (isa<ConstantSDNode>(BarOp)) {
uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
Opc = AMDGPU::S_BARRIER_JOIN_IMM;

switch (IntrinsicID) {
default:
return SDValue();
case Intrinsic::amdgcn_s_barrier_join:
Opc = AMDGPU::S_BARRIER_JOIN_IMM;
break;
case Intrinsic::amdgcn_s_wakeup_barrier:
Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
break;
}
// extract the BarrierID from bits 4-9 of the immediate
unsigned BarID = (BarVal >> 4) & 0x3F;
SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
Ops.push_back(K);
Ops.push_back(Chain);
} else {
Opc = AMDGPU::S_BARRIER_JOIN_M0;

switch (IntrinsicID) {
default:
return SDValue();
case Intrinsic::amdgcn_s_barrier_join:
Opc = AMDGPU::S_BARRIER_JOIN_M0;
break;
case Intrinsic::amdgcn_s_wakeup_barrier:
Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
break;
}
// extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
SDValue M0Val;
M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
Expand Down
14 changes: 14 additions & 0 deletions llvm/lib/Target/AMDGPU/SOPInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -504,6 +504,12 @@ def S_BARRIER_JOIN_M0 : SOP1_Pseudo <"s_barrier_join m0", (outs), (ins),
let isConvergent = 1;
}

def S_WAKEUP_BARRIER_M0 : SOP1_Pseudo <"s_wakeup_barrier m0", (outs), (ins),
"", []>{
let SchedRW = [WriteBarrier];
let isConvergent = 1;
let SubtargetPredicate = HasSWakeupBarrier;
}
} // End Uses = [M0]

def S_BARRIER_SIGNAL_IMM : SOP1_Pseudo <"s_barrier_signal", (outs),
Expand All @@ -527,6 +533,12 @@ def S_BARRIER_JOIN_IMM : SOP1_Pseudo <"s_barrier_join", (outs),
let isConvergent = 1;
}

def S_WAKEUP_BARRIER_IMM : SOP1_Pseudo <"s_wakeup_barrier", (outs),
(ins SplitBarrier:$src0), "$src0", []>{
let SchedRW = [WriteBarrier];
let isConvergent = 1;
let SubtargetPredicate = HasSWakeupBarrier;
}
} // End has_sdst = 0

def S_GET_BARRIER_STATE_IMM : SOP1_Pseudo <"s_get_barrier_state", (outs SSrc_b32:$sdst),
Expand Down Expand Up @@ -2226,6 +2238,8 @@ defm S_SLEEP_VAR : SOP1_IMM_Real_gfx12<0x058>;
// GFX1250
defm S_GET_SHADER_CYCLES_U64 : SOP1_Real_gfx12<0x06>;
defm S_ADD_PC_I64 : SOP1_Real_gfx12<0x04b>;
defm S_WAKEUP_BARRIER_M0 : SOP1_M0_Real_gfx12<0x057>;
defm S_WAKEUP_BARRIER_IMM : SOP1_IMM_Real_gfx12<0x057>;

//===----------------------------------------------------------------------===//
// SOP1 - GFX1150, GFX12
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/TargetParser/TargetParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -443,6 +443,7 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T,
Features["atomic-global-pk-add-bf16-inst"] = true;
Features["atomic-ds-pk-add-16-insts"] = true;
Features["setprio-inc-wg-inst"] = true;
Features["s-wakeup-barrier-inst"] = true;
Features["atomic-fmin-fmax-global-f32"] = true;
Features["atomic-fmin-fmax-global-f64"] = true;
Features["wavefrontsize32"] = true;
Expand Down
Loading