AMDGPU: Figure out required AGPR count for inline asm #150910
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

For now just try to compute the minimum number of AGPRs required to allocate the asm. Leave the attributor changes to turn this into an integer value for later.

Full diff: https://github.com/llvm/llvm-project/pull/150910.diff

2 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 49d8b4447adfd..f0dbd81c874fd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -1201,16 +1201,61 @@ AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP,
llvm_unreachable("AAAMDWavesPerEU is only valid for function position");
}
-static bool inlineAsmUsesAGPRs(const InlineAsm *IA) {
- for (const auto &CI : IA->ParseConstraints()) {
+/// Compute the minimum number of AGPRs required to allocate the inline asm.
+static unsigned inlineAsmGetNumRequiredAGPRs(const InlineAsm *IA,
+ const CallBase &Call) {
+ unsigned ArgNo = 0;
+ unsigned ResNo = 0;
+ unsigned AGPRDefCount = 0;
+ unsigned AGPRUseCount = 0;
+ unsigned MaxPhysReg = 0;
+ const DataLayout &DL = Call.getFunction()->getParent()->getDataLayout();
+
+ for (const InlineAsm::ConstraintInfo &CI : IA->ParseConstraints()) {
+ Type *Ty = nullptr;
+ switch (CI.Type) {
+ case InlineAsm::isOutput: {
+ Ty = Call.getType();
+ if (auto *STy = dyn_cast<StructType>(Ty))
+ Ty = STy->getElementType(ResNo);
+ ++ResNo;
+ break;
+ }
+ case InlineAsm::isInput: {
+ Ty = Call.getArgOperand(ArgNo++)->getType();
+ break;
+ }
+ case InlineAsm::isLabel:
+ continue;
+ case InlineAsm::isClobber:
+ // Parse the physical register reference.
+ break;
+ }
+
for (StringRef Code : CI.Codes) {
- Code.consume_front("{");
- if (Code.starts_with("a"))
- return true;
+ if (Code.starts_with("a")) {
+ // Virtual register, compute number of registers based on the type.
+ //
+ // We ought to be going through TargetLowering to get the number of
+ // registers, but we should avoid the dependence on CodeGen here.
+ unsigned RegCount = divideCeil(DL.getTypeSizeInBits(Ty), 32);
+ if (CI.Type == InlineAsm::isOutput) {
+ AGPRDefCount += RegCount;
+ if (CI.isEarlyClobber)
+ AGPRUseCount += RegCount;
+ } else
+ AGPRUseCount += RegCount;
+ } else {
+ // Physical register reference
+ auto [Kind, RegIdx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Code);
+ if (Kind == 'a')
+ MaxPhysReg = std::max(MaxPhysReg, std::min(RegIdx + NumRegs, 256u));
+ }
}
}
- return false;
+ unsigned MaxVirtReg = std::max(AGPRUseCount, AGPRDefCount);
+ return std::min(MaxVirtReg + MaxPhysReg, 256u);
}
// TODO: Migrate to range merge of amdgpu-agpr-alloc.
@@ -1252,7 +1297,7 @@ struct AAAMDGPUNoAGPR
const Function *Callee = dyn_cast<Function>(CalleeOp);
if (!Callee) {
if (const InlineAsm *IA = dyn_cast<InlineAsm>(CalleeOp))
- return !inlineAsmUsesAGPRs(IA);
+ return inlineAsmGetNumRequiredAGPRs(IA, CB) == 0;
return false;
}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
index 181dab8d4ca79..e502995cdb8ea 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
@@ -251,6 +251,205 @@ define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) {
ret void
}
+define amdgpu_kernel void @kernel_uses_asm_virtreg_def_struct_0() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_def_struct_0(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[DEF:%.*]] = call { i32, i32 } asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ %def = call {i32, i32} asm sideeffect "; def $0", "=a,=a"()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_virtreg_use_struct_1() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_use_struct_1(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[DEF:%.*]] = call { i32, <2 x i32> } asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ %def = call {i32, <2 x i32>} asm sideeffect "; def $0", "=a,=a"()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_virtreg_use_struct_2() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_use_struct_2(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[DEF:%.*]] = call { i32, <2 x i32> } asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ %def = call {i32, <2 x i32>} asm sideeffect "; def $0", "=a,=v"()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_virtreg_ptr_ty() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_ptr_ty(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; use $0", "a"(ptr poison)
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_virtreg_def_ptr_ty() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_def_ptr_ty(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[DEF:%.*]] = call ptr asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ %def = call ptr asm sideeffect "; def $0", "=a"()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_virtreg_def_vector_ptr_ty() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_def_vector_ptr_ty(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[DEF:%.*]] = call <2 x ptr> asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ %def = call <2 x ptr> asm sideeffect "; def $0", "=a"()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_physreg_def_struct_0() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_physreg_def_struct_0(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[DEF:%.*]] = call { i32, i32 } asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ %def = call {i32, i32} asm sideeffect "; def $0", "={a0},={a[4:5]}"()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_clobber() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_clobber(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; clobber $0", "~{a4}"()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_clobber_tuple() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_clobber_tuple(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; clobber $0", "~{a[10:13]}"()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_clobber_oob() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_clobber_oob(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; clobber $0", "~{a256}"()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_clobber_max() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_clobber_max(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; clobber $0", "~{a255}"()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_physreg_oob() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_physreg_oob(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; use $0", "{a256}"(i32 poison)
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_virtreg_def_max_ty() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_def_max_ty(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[DEF:%.*]] = call <32 x i32> asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ %def = call <32 x i32> asm sideeffect "; def $0", "=a"()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_virtreg_use_max_ty() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_use_max_ty(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; use $0", "a"(<32 x i32> poison)
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_virtreg_use_def_max_ty() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_use_def_max_ty(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[DEF:%.*]] = call <32 x i32> asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ %def = call <32 x i32> asm sideeffect "; use $0", "=a,a"(<32 x i32> poison)
+ ret void
+}
+
+define amdgpu_kernel void @vreg_use_exceeds_register_file() {
+; CHECK-LABEL: define amdgpu_kernel void @vreg_use_exceeds_register_file(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; use $0", "a"(<257 x i32> poison)
+ ret void
+}
+
+define amdgpu_kernel void @vreg_def_exceeds_register_file() {
+; CHECK-LABEL: define amdgpu_kernel void @vreg_def_exceeds_register_file(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[DEF:%.*]] = call <257 x i32> asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ %def = call <257 x i32> asm sideeffect "; def $0", "=a"()
+ ret void
+}
+
+define amdgpu_kernel void @multiple() {
+; CHECK-LABEL: define amdgpu_kernel void @multiple(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[DEF:%.*]] = call { <16 x i32>, <8 x i32>, <8 x i32> } asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ %def = call {<16 x i32>, <8 x i32>, <8 x i32>} asm sideeffect "; def $0", "=a,=a,=a,a,a,a"(<4 x i32> splat (i32 0), <8 x i32> splat (i32 1), i64 999)
+ ret void
+}
+
+define amdgpu_kernel void @earlyclobber_0() {
+; CHECK-LABEL: define amdgpu_kernel void @earlyclobber_0(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[DEF:%.*]] = call <8 x i32> asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ %def = call <8 x i32> asm sideeffect "; def $0", "=&a,a"(i32 0)
+ ret void
+}
+
+define amdgpu_kernel void @earlyclobber_1() {
+; CHECK-LABEL: define amdgpu_kernel void @earlyclobber_1(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[DEF:%.*]] = call { <8 x i32>, <16 x i32> } asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ %def = call { <8 x i32>, <16 x i32 > } asm sideeffect "; def $0, $1", "=&a,=&a,a,a"(i32 0, <16 x i32> splat (i32 1))
+ ret void
+}
attributes #0 = { "amdgpu-agpr-alloc"="0" }
;.
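As a worked example of the new counting (an editorial note; the numbers follow from the patch): in the @multiple test above, the constraint string "=a,=a,=a,a,a,a" defines {<16 x i32>, <8 x i32>, <8 x i32>} and consumes <4 x i32>, <8 x i32>, and i64, so AGPRDefCount = 16 + 8 + 8 = 32 and AGPRUseCount = 4 + 8 + 2 = 14; with no physical registers referenced, the reported requirement is max(32, 14) = 32 AGPRs.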
Also fixes an assertion on out-of-bounds physical register indexes.
Force-pushed from ae3f032 to b917c11.
Force-pushed from 98879a5 to 0a6415a.
  unsigned MaxVirtReg = std::max(AGPRUseCount, AGPRDefCount);
  return std::min(MaxVirtReg + MaxPhysReg, 256u);
Is it intended that this overestimates the AGPR requirement if a physical register and a number of virtual registers that would fit into the register file before the physical register are requested? For example, {a[31]} and up to 31 "a" constraints for virtual registers: this would report 63 registers, but 32 (a0 through a31) should be enough, right?
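A minimal IR sketch of this scenario, following the patch's arithmetic (the function name is hypothetical, not from the PR):

  define amdgpu_kernel void @physreg_plus_vregs() {
    ; {a31} yields MaxPhysReg = 31 + 1 = 32, and the <31 x i32> input yields
    ; AGPRUseCount = 31, so the function reports min(31 + 32, 256) = 63 even
    ; though a0..a31 (32 registers) would suffice.
    call void asm sideeffect "; use $0, $1", "{a31},a"(i32 0, <31 x i32> poison)
    ret void
  }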
The physical registers are unavailable for assignment to the virtual registers; it's supposed to be exact-ish (not sure I have the early-clobber logic correct). In that example you need the 31 additional registers on top of whatever physical registers are used.
They conflict within uses and within defs; non-earlyclobber defs should be usable with vreg inputs.
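A minimal IR sketch of that distinction under the patch's counting (function names hypothetical, not from the PR):

  define amdgpu_kernel void @def_shares_use_regs() {
    ; Non-earlyclobber def: AGPRDefCount = 8 and AGPRUseCount = 8, so the
    ; function reports max(8, 8) = 8; the def may reuse the input registers.
    %def = call <8 x i32> asm sideeffect "; def $0", "=a,a"(<8 x i32> poison)
    ret void
  }

  define amdgpu_kernel void @earlyclobber_def() {
    ; Earlyclobber def also counts on the use side: AGPRDefCount = 8 and
    ; AGPRUseCount = 8 + 8 = 16, so the function reports max(8, 16) = 16.
    %def = call <8 x i32> asm sideeffect "; def $0", "=&a,a"(<8 x i32> poison)
    ret void
  }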
For this code

  define amdgpu_kernel void @foo() {
    call void asm sideeffect "; use $0, $1, $2", "{a16},a,a"(i32 17, <8 x i32> splat (i32 1), <16 x i32> splat (i32 2))
    ret void
  }

we allocate "; use a16, a[18:25], a[0:15]", so the asm uses 25 AGPRs (arguably 26, since a25 is used and a17 is left out; not sure why it's not allocated as a[17:24], I'm not aware of alignment requirements for AGPRs).

This function computes 17 (the highest required physical register index + 1) + 24 (the number of virtual registers required) = 41 AGPRs required.

This over-approximation seems worth pointing out in a comment, if it's intended.
For now just try to compute the minimum number of AGPRs required to allocate the asm. Leave the attributor changes to turn this into an integer value for later.
Force-pushed from 0a6415a to 2cef45d.