From dca720fcc34c082c8c7f16670d2edb26f617496b Mon Sep 17 00:00:00 2001
From: Patrick Simmons
Date: Fri, 22 Aug 2025 00:14:03 -0500
Subject: [PATCH 1/7] Attempt to add inline asm to sched group barriers

---
 llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index dbe74b1b08f8c..9689773872a6e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -75,8 +75,9 @@ enum class SchedGroupMask {
   DS_READ = 1u << 8,
   DS_WRITE = 1u << 9,
   TRANS = 1u << 10,
+  INLINE_ASM = 1u << 11,
   ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
-        DS_READ | DS_WRITE | TRANS,
+        DS_READ | DS_WRITE | TRANS | INLINE_ASM,
 
   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
 };
@@ -2436,6 +2437,10 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
            TII->isTRANS(MI))
     Result = true;
 
+  else if (((SGMask & SchedGroupMask::INLINE_ASM) != SchedGroupMask::NONE) &&
+           MI.isInlineAsm())
+    Result = true;
+
   LLVM_DEBUG(
       dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
              << (Result ? " could classify " : " unable to classify ") << MI);

From 88abb6ec6bd6a85a5da0ee5f23f7bd7fa1e4b29d Mon Sep 17 00:00:00 2001
From: Patrick Simmons
Date: Tue, 26 Aug 2025 15:23:02 -0500
Subject: [PATCH 2/7] Allow specifying sched group barrier masks for inline asm

---
 llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 9689773872a6e..8c514714bd7dd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -75,9 +75,8 @@ enum class SchedGroupMask {
   DS_READ = 1u << 8,
   DS_WRITE = 1u << 9,
   TRANS = 1u << 10,
-  INLINE_ASM = 1u << 11,
   ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
-        DS_READ | DS_WRITE | TRANS | INLINE_ASM,
+        DS_READ | DS_WRITE | TRANS,
 
   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
 };
@@ -2392,6 +2391,16 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
   if (MI.isMetaInstruction())
     Result = false;
 
+  else if (MI.isInlineAsm()) {
+    std::string Text = MI.getOperand(0).getSymbolName();
+    if (Text.find("SGMASK:") != std::string::npos) {
+      Text = Text.substr(Text.find("SGMASK:") + strlen("SGMASK:"));
+      Text = Text.substr(0, Text.find_first_of(" \t\r\n"));
+      unsigned long InlineAsmMask = std::stoul(Text, nullptr, 0);
+      Result = ((unsigned long)SGMask & InlineAsmMask) != 0;
+    }
+  }
+
   else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
            (TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) ||
             TII->isTRANS(MI)))
@@ -2437,10 +2446,6 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
            TII->isTRANS(MI))
     Result = true;
 
-  else if (((SGMask & SchedGroupMask::INLINE_ASM) != SchedGroupMask::NONE) &&
-           MI.isInlineAsm())
-    Result = true;
-
   LLVM_DEBUG(
       dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
              << (Result ? " could classify " : " unable to classify ") << MI);
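Note on usage: with [PATCH 2/7] applied, an inline asm statement opts into
llvm.amdgcn.sched.group.barrier matching by embedding "SGMASK:<mask>" anywhere
in its asm string. The value is parsed with std::stoul(Text, nullptr, 0), so
0x-prefixed hex, 0-prefixed octal, and plain decimal all work, and parsing
stops at the first whitespace after the value. A minimal sketch of the
annotation (hypothetical function, not part of the series; mask 0x2 assumes
the VALU bit of the SchedGroupMask encoding):

    define float @sgmask_example(float %x, float %y) {
      ; Tag the asm with the VALU group bit (1u << 1 == 0x2).
      %sum = call float asm "v_add_f32_e32 $0, $1, $2 ; SGMASK:0x2",
                            "=v,v,v"(float %x, float %y)
      ; Reserve one VALU slot that the tagged asm is eligible to fill.
      call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 1, i32 0)
      ret float %sum
    }

    declare void @llvm.amdgcn.sched.group.barrier(i32 immarg, i32 immarg, i32 immarg)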
" could classify " : " unable to classify ") << MI); From 35b281b89626f20a0c313d87b5269facf9e33db4 Mon Sep 17 00:00:00 2001 From: Patrick Simmons Date: Thu, 28 Aug 2025 11:17:43 -0500 Subject: [PATCH 3/7] Switch to StringRef --- llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp index 8c514714bd7dd..fedc1e73b4123 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -2392,11 +2392,11 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const { Result = false; else if (MI.isInlineAsm()) { - std::string Text = MI.getOperand(0).getSymbolName(); + StringRef Text = MI.getOperand(0).getSymbolName(); if (Text.find("SGMASK:") != std::string::npos) { Text = Text.substr(Text.find("SGMASK:") + strlen("SGMASK:")); Text = Text.substr(0, Text.find_first_of(" \t\r\n")); - unsigned long InlineAsmMask = std::stoul(Text, nullptr, 0); + unsigned long InlineAsmMask = std::stoul(Text.str(), nullptr, 0); Result = ((unsigned long)SGMask & InlineAsmMask) != 0; } } From a3f52ee0eb752286f7cf6f16b787cb99d8625180 Mon Sep 17 00:00:00 2001 From: Patrick Simmons Date: Fri, 29 Aug 2025 17:35:22 -0500 Subject: [PATCH 4/7] Add testcase --- llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll | 71 ++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll diff --git a/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll b/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll new file mode 100644 index 0000000000000..2f35640226fd6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll @@ -0,0 +1,71 @@ +; RUN: llc -O3 -mcpu=gfx942 < %s | FileCheck %s +; CHECK: v_add_f32_e32 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: v_add_f32_e32 +; CHECK-NEXT: ;;#ASMEND +; CHECK: v_add_f32_e32 +; ModuleID = '' +source_filename = "llvm-link" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" +target triple = "amdgcn-amd-amdhsa" + +@llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(1) @__hip_cuid_bffb86447932ec40 to ptr)], section "llvm.metadata" +@__hip_cuid_bffb86447932ec40 = addrspace(1) global i8 0 + +; Function Attrs: convergent mustprogress norecurse nounwind +define protected amdgpu_kernel void @_Z17group4_sum_floaatPfPKfi(ptr addrspace(1) noalias noundef writeonly captures(none) %to.coerce, ptr addrspace(1) noalias noundef readonly captures(none) %from.coerce, i32 noundef %length) local_unnamed_addr #0 { +entry: + %0 = tail call i32 @llvm.amdgcn.workgroup.id.x() + %mul = shl i32 %0, 6 + %1 = tail call i32 @llvm.amdgcn.workitem.id.x() + %add = add i32 %mul, %1 + %cmp = icmp slt i32 %add, %length + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %idx.ext = sext i32 %add to i64 + %add.ptr = getelementptr inbounds float, ptr addrspace(1) %to.coerce, i64 %idx.ext + %mul3 = shl nsw i32 %add, 2 + %idx.ext4 = sext i32 %mul3 to i64 + %add.ptr5 = getelementptr inbounds float, ptr addrspace(1) %from.coerce, i64 %idx.ext4 + %2 = load <4 x float>, ptr addrspace(1) %add.ptr5, align 16, !tbaa !0 + %3 = extractelement <4 x float> %2, i64 3 + %4 = extractelement <4 x float> %2, i64 0 + %5 = tail call contract noundef float asm "v_add_f32_e32 
From a24521a8c8de26eaa39535a863b7db7843c82951 Mon Sep 17 00:00:00 2001
From: Patrick Simmons
Date: Wed, 1 Oct 2025 15:25:02 -0500
Subject: [PATCH 5/7] Guess the constraints instead of using user-provided hints

---
 llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp    | 43 ++++++++++++++++++++
 llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll | 17 ++++----
 2 files changed, 53 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index fedc1e73b4123..5d352f16048f6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -2392,6 +2392,48 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
     Result = false;
 
   else if (MI.isInlineAsm()) {
+    auto &TRI = TII->getRegisterInfo();
+    auto &MRI = MI.getParent()->getParent()->getRegInfo();
+    bool SGPR_used = false, VGPR_used = false, VMFMA_used = false,
+         MayLoad = MI.mayLoad(), MayStore = MI.mayStore();
+    for (const MachineOperand &Operand : MI.operands())
+      if (Operand.isReg()) {
+        auto &RegClass = *TRI.getRegClassForOperandReg(MRI, Operand);
+        if (TRI.isVGPRClass(&RegClass))
+          VGPR_used = true;
+        if (TRI.isAGPRClass(&RegClass) || TRI.getRegSizeInBits(RegClass) > 128)
+          VMFMA_used = true;
+        if (TRI.isSGPRClass(&RegClass))
+          SGPR_used = true;
+      }
+
+    unsigned long InlineAsmMask = 0;
+    if (VGPR_used && !VMFMA_used && !MayLoad && !MayStore)
+      InlineAsmMask |= (unsigned long)SchedGroupMask::VALU;
+    if (VMFMA_used)
+      InlineAsmMask |= (unsigned long)SchedGroupMask::MFMA;
+    if (VGPR_used && MayLoad)
+      InlineAsmMask |= (unsigned long)SchedGroupMask::VMEM_READ;
+    if (VGPR_used && MayStore)
+      InlineAsmMask |= (unsigned long)SchedGroupMask::VMEM_WRITE;
+    if (!VGPR_used && MayLoad)
+      InlineAsmMask |= (unsigned long)SchedGroupMask::DS_READ;
+    if (!VGPR_used && MayStore)
+      InlineAsmMask |= (unsigned long)SchedGroupMask::DS_WRITE;
+    if (InlineAsmMask & (unsigned long)SchedGroupMask::VALU ||
+        InlineAsmMask & (unsigned long)SchedGroupMask::SALU)
+      InlineAsmMask |= (unsigned long)SchedGroupMask::ALU;
+    if (InlineAsmMask & (unsigned long)SchedGroupMask::DS_READ ||
+        InlineAsmMask & (unsigned long)SchedGroupMask::DS_WRITE)
+      InlineAsmMask |= (unsigned long)SchedGroupMask::DS;
+    if (InlineAsmMask & (unsigned long)SchedGroupMask::VMEM_READ ||
+        InlineAsmMask & (unsigned long)SchedGroupMask::VMEM_WRITE)
+      InlineAsmMask |= (unsigned long)SchedGroupMask::VMEM;
+
+    Result = ((unsigned long)SGMask & InlineAsmMask) != 0;
+
+    // Original implementation
+#if 0
     StringRef Text = MI.getOperand(0).getSymbolName();
     if (Text.find("SGMASK:") != std::string::npos) {
       Text = Text.substr(Text.find("SGMASK:") + strlen("SGMASK:"));
@@ -2399,6 +2441,7 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
       unsigned long InlineAsmMask = std::stoul(Text.str(), nullptr, 0);
       Result = ((unsigned long)SGMask & InlineAsmMask) != 0;
     }
+#endif
   }
 
   else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
diff --git a/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll b/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll
index 2f35640226fd6..89bf6ad677545 100644
--- a/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll
@@ -28,12 +28,15 @@ if.then: ; preds = %entry
   %mul3 = shl nsw i32 %add, 2
   %idx.ext4 = sext i32 %mul3 to i64
   %add.ptr5 = getelementptr inbounds float, ptr addrspace(1) %from.coerce, i64 %idx.ext4
-  %2 = load <4 x float>, ptr addrspace(1) %add.ptr5, align 16, !tbaa !0
-  %3 = extractelement <4 x float> %2, i64 3
-  %4 = extractelement <4 x float> %2, i64 0
-  %5 = tail call contract noundef float asm "v_add_f32_e32 $0, $1, $2 ; SGMASK:0x1", "=v,v,v"(float %3, float %4) #3, !srcloc !3
-  %6 = extractelement <4 x float> %2, i64 1
-  %7 = extractelement <4 x float> %2, i64 2
+  %2 = load <2 x float>, ptr addrspace(1) %add.ptr5, align 16, !tbaa !0
+  %a20 = add i64 %idx.ext4, 2
+  %a21 = getelementptr inbounds float, ptr addrspace(1) %from.coerce, i64 %a20
+  %a22 = load <2 x float>, ptr addrspace(1) %a21, align 16, !tbaa !0
+  %3 = extractelement <2 x float> %a22, i64 1
+  %4 = extractelement <2 x float> %2, i64 0
+  %5 = tail call contract noundef float asm "v_mfma_f64_4x4x4f64 $0, $1, $2, 0", "=a,v,v"(<2 x float> %2, <2 x float> %a22) #3, !srcloc !3
+  %6 = extractelement <2 x float> %2, i64 1
+  %7 = extractelement <2 x float> %a22, i64 0
   %add6 = fadd contract float %6, %7
   %add7 = fadd contract float %5, %add6
   store float %add7, ptr addrspace(1) %add.ptr, align 4, !tbaa !4
@@ -41,7 +44,7 @@ if.then: ; preds = %entry
   tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 5, i32 0)
   tail call void @llvm.amdgcn.sched.group.barrier(i32 16, i32 1, i32 0)
   tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 1, i32 0)
-  tail call void @llvm.amdgcn.sched.group.barrier(i32 1, i32 1, i32 0)
+  tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
   tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 1, i32 0)
   br label %if.end
 
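With [PATCH 5/7] the SGMASK: annotation is no longer consulted; the pass
instead infers a mask from the asm's register operands and memory effects.
A sketch of how the heuristic classifies two asm statements (hypothetical
functions; the classifications follow my reading of the predicates above,
and the second case mirrors the updated testcase):

    ; Every register operand is a VGPR and the asm neither loads nor stores:
    ; only VGPR_used is set, so InlineAsmMask = VALU plus the derived ALU bit.
    define float @valu_case(float %x, float %y) {
      %v = call float asm "v_add_f32_e32 $0, $1, $2", "=v,v,v"(float %x, float %y)
      ret float %v
    }

    ; "=a" makes the result an AGPR, which sets VMFMA_used, so the asm is
    ; classified MFMA; a register wider than 128 bits triggers the same path.
    define float @mfma_case(<2 x float> %p, <2 x float> %q) {
      %m = call float asm "v_mfma_f64_4x4x4f64 $0, $1, $2, 0", "=a,v,v"(<2 x float> %p, <2 x float> %q)
      ret float %m
    }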
From 9604498d0bdb93a23fb1df16420484d9dc361641 Mon Sep 17 00:00:00 2001
From: Patrick Simmons
Date: Tue, 7 Oct 2025 11:55:04 -0500
Subject: [PATCH 6/7] Fix error-that-should-be-warning, code may or may not be
 correct

---
 llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 5d352f16048f6..02f119845680a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -2410,6 +2410,8 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
     unsigned long InlineAsmMask = 0;
     if (VGPR_used && !VMFMA_used && !MayLoad && !MayStore)
       InlineAsmMask |= (unsigned long)SchedGroupMask::VALU;
+    if (SGPR_used && !MayLoad && !MayStore) //arsenm: should this have !VGPR_used?
+      InlineAsmMask |= (unsigned long)SchedGroupMask::SALU;
     if (VMFMA_used)
      InlineAsmMask |= (unsigned long)SchedGroupMask::MFMA;
     if (VGPR_used && MayLoad)
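The new SALU rule means an asm whose only register operands are SGPRs, with no
memory access, now matches SALU groups (and, via the derived bit, ALU). Note
the open question in the inline review comment: as written here, an asm
mixing SGPR and VGPR operands matches both SALU and VALU, since the VALU test
does not yet exclude SGPR_used. A sketch of the SGPR-only case (hypothetical
function, classification assumed from the predicates above):

    ; Only SGPR operands, no loads or stores: InlineAsmMask = SALU | ALU.
    define i32 @salu_case(i32 %a, i32 %b) {
      %s = call i32 asm "s_add_i32 $0, $1, $2", "=s,s,s"(i32 %a, i32 %b)
      ret i32 %s
    }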
From aac0dd47c299df26d8f75bac1c5798b01843ae75 Mon Sep 17 00:00:00 2001
From: Patrick Simmons
Date: Tue, 7 Oct 2025 12:52:43 -0500
Subject: [PATCH 7/7] Update testcase; update algorithm to something maybe
 right

---
 llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp    | 4 ++--
 llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 02f119845680a..e59374658e69e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -2408,9 +2408,9 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
       }
 
     unsigned long InlineAsmMask = 0;
-    if (VGPR_used && !VMFMA_used && !MayLoad && !MayStore)
+    if (VGPR_used && !SGPR_used && !VMFMA_used && !MayLoad && !MayStore)
       InlineAsmMask |= (unsigned long)SchedGroupMask::VALU;
-    if (SGPR_used && !MayLoad && !MayStore) //arsenm: should this have !VGPR_used?
+    if (SGPR_used && !MayLoad && !MayStore)
       InlineAsmMask |= (unsigned long)SchedGroupMask::SALU;
     if (VMFMA_used)
       InlineAsmMask |= (unsigned long)SchedGroupMask::MFMA;
diff --git a/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll b/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll
index 89bf6ad677545..402b1408284d5 100644
--- a/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -O3 -mcpu=gfx942 < %s | FileCheck %s
 ; CHECK: v_add_f32_e32
 ; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: v_add_f32_e32
+; CHECK-NEXT: v_mfma_f64
 ; CHECK-NEXT: ;;#ASMEND
 ; CHECK: v_add_f32_e32
 ; ModuleID = ''
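Summary of the heuristic after [PATCH 7/7], as I read the final predicates:
VGPR-only asm with no memory access maps to VALU | ALU; any SGPR operand with
no memory access maps to SALU | ALU (mixed SGPR+VGPR no longer matches VALU);
an AGPR operand or a register wider than 128 bits maps to MFMA; a VGPR asm
that may load or store maps to VMEM_READ or VMEM_WRITE plus VMEM; a non-VGPR
asm that may load or store maps to DS_READ or DS_WRITE plus DS. A sketch of
the mixed case the last commit changes (hypothetical function):

    ; Mixed SGPR and VGPR operands, no memory access: after this patch the
    ; asm matches SALU (and ALU) but no longer VALU, answering the review
    ; question from the previous commit.
    define i32 @mixed_case(i32 %v) {
      %r = call i32 asm "v_readfirstlane_b32 $0, $1", "=s,v"(i32 %v)
      ret i32 %r
    }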