-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] Only try DecoderTables for the current subtarget. NFCI. #82992
Conversation
Speed up disassembly by only calling tryDecodeInst for DecoderTables that make sense for the current subtarget. This gives a 1.3x speed-up on check-llvm-mc-disassembler-amdgpu in my Release+Asserts build.
@llvm/pr-subscribers-backend-amdgpu Author: Jay Foad (jayfoad) ChangesSpeed up disassembly by only calling tryDecodeInst for DecoderTables This gives a 1.3x speed-up on check-llvm-mc-disassembler-amdgpu in my Full diff: https://github.com/llvm/llvm-project/pull/82992.diff 3 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index e1cca17bdbf432..8c42304ce0bee5 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -466,15 +466,18 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
if (isGFX11Plus() && Bytes.size() >= 12 ) {
DecoderUInt128 DecW = eat12Bytes(Bytes);
- if (tryDecodeInst(DecoderTableGFX1196, DecoderTableGFX11_FAKE1696, MI,
+ if (isGFX11() &&
+ tryDecodeInst(DecoderTableGFX1196, DecoderTableGFX11_FAKE1696, MI,
DecW, Address, CS))
break;
- if (tryDecodeInst(DecoderTableGFX1296, DecoderTableGFX12_FAKE1696, MI,
+ if (isGFX12() &&
+ tryDecodeInst(DecoderTableGFX1296, DecoderTableGFX12_FAKE1696, MI,
DecW, Address, CS))
break;
- if (tryDecodeInst(DecoderTableGFX12W6496, MI, DecW, Address, CS))
+ if (isGFX12() &&
+ tryDecodeInst(DecoderTableGFX12W6496, MI, DecW, Address, CS))
break;
}
@@ -507,27 +510,32 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
tryDecodeInst(DecoderTableGFX90A64, MI, QW, Address, CS))
break;
- if (tryDecodeInst(DecoderTableGFX864, MI, QW, Address, CS))
+ if ((isVI() || isGFX9()) &&
+ tryDecodeInst(DecoderTableGFX864, MI, QW, Address, CS))
break;
- if (tryDecodeInst(DecoderTableGFX964, MI, QW, Address, CS))
+ if (isGFX9() && tryDecodeInst(DecoderTableGFX964, MI, QW, Address, CS))
break;
- if (tryDecodeInst(DecoderTableGFX1064, MI, QW, Address, CS))
+ if (isGFX10() && tryDecodeInst(DecoderTableGFX1064, MI, QW, Address, CS))
break;
- if (tryDecodeInst(DecoderTableGFX1264, DecoderTableGFX12_FAKE1664, MI, QW,
+ if (isGFX12() &&
+ tryDecodeInst(DecoderTableGFX1264, DecoderTableGFX12_FAKE1664, MI, QW,
Address, CS))
break;
- if (tryDecodeInst(DecoderTableGFX1164, DecoderTableGFX11_FAKE1664, MI, QW,
+ if (isGFX11() &&
+ tryDecodeInst(DecoderTableGFX1164, DecoderTableGFX11_FAKE1664, MI, QW,
Address, CS))
break;
- if (tryDecodeInst(DecoderTableGFX11W6464, MI, QW, Address, CS))
+ if (isGFX11() &&
+ tryDecodeInst(DecoderTableGFX11W6464, MI, QW, Address, CS))
break;
- if (tryDecodeInst(DecoderTableGFX12W6464, MI, QW, Address, CS))
+ if (isGFX12() &&
+ tryDecodeInst(DecoderTableGFX12W6464, MI, QW, Address, CS))
break;
}
@@ -538,13 +546,14 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
if (Bytes.size() >= 4) {
const uint32_t DW = eatBytes<uint32_t>(Bytes);
- if (tryDecodeInst(DecoderTableGFX832, MI, DW, Address, CS))
+ if ((isVI() || isGFX9()) &&
+ tryDecodeInst(DecoderTableGFX832, MI, DW, Address, CS))
break;
if (tryDecodeInst(DecoderTableAMDGPU32, MI, DW, Address, CS))
break;
- if (tryDecodeInst(DecoderTableGFX932, MI, DW, Address, CS))
+ if (isGFX9() && tryDecodeInst(DecoderTableGFX932, MI, DW, Address, CS))
break;
if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts) &&
@@ -555,14 +564,16 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
tryDecodeInst(DecoderTableGFX10_B32, MI, DW, Address, CS))
break;
- if (tryDecodeInst(DecoderTableGFX1032, MI, DW, Address, CS))
+ if (isGFX10() && tryDecodeInst(DecoderTableGFX1032, MI, DW, Address, CS))
break;
- if (tryDecodeInst(DecoderTableGFX1132, DecoderTableGFX11_FAKE1632, MI, DW,
+ if (isGFX11() &&
+ tryDecodeInst(DecoderTableGFX1132, DecoderTableGFX11_FAKE1632, MI, DW,
Address, CS))
break;
- if (tryDecodeInst(DecoderTableGFX1232, DecoderTableGFX12_FAKE1632, MI, DW,
+ if (isGFX12() &&
+ tryDecodeInst(DecoderTableGFX1232, DecoderTableGFX12_FAKE1632, MI, DW,
Address, CS))
break;
}
@@ -1750,6 +1761,10 @@ bool AMDGPUDisassembler::isGFX11Plus() const {
return AMDGPU::isGFX11Plus(STI);
}
+bool AMDGPUDisassembler::isGFX12() const {
+ return STI.hasFeature(AMDGPU::FeatureGFX12);
+}
+
bool AMDGPUDisassembler::isGFX12Plus() const {
return AMDGPU::isGFX12Plus(STI);
}
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 2e1b6fb1c740b7..6a4cb120872089 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -275,6 +275,7 @@ class AMDGPUDisassembler : public MCDisassembler {
bool isGFX10Plus() const;
bool isGFX11() const;
bool isGFX11Plus() const;
+ bool isGFX12() const;
bool isGFX12Plus() const;
bool hasArchitectedFlatScratch() const;
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index fe4db0ebb0262d..cc374fbae7cc56 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -527,7 +527,7 @@ multiclass MIMG_NoSampler_Src_Helper <mimgopc op, string asm,
let ssamp = 0 in {
if op.HAS_GFX10M then {
def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32,
- !if(enableDisasm, "GFX10", "")>;
+ !if(enableDisasm, "GFX8", "")>;
if !not(ExtendedImageInst) then
def _V1_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VGPR_32,
!if(enableDisasm, "GFX90A", "")>;
@@ -754,7 +754,7 @@ multiclass MIMG_Store_Addr_Helper <mimgopc op, string asm,
let ssamp = 0 in {
if op.HAS_GFX10M then {
def _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32,
- !if(enableDisasm, "GFX10", "")>;
+ !if(enableDisasm, "GFX8", "")>;
let hasPostISelHook = 1 in
def _V1_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VGPR_32,
!if(enableDisasm, "GFX90A", "")>;
@@ -1298,7 +1298,7 @@ multiclass MIMG_Sampler_Src_Helper <mimgopc op, string asm,
if op.HAS_GFX10M then {
def _V # addr.NumWords
: MIMG_Sampler_Helper <op, asm, dst_rc, addr.RegClass,
- !if(!and(enableDisasm, addr.Disassemble), "GFX10", "")>;
+ !if(!and(enableDisasm, addr.Disassemble), "GFX8", "")>;
if !not(ExtendedImageInst) then
def _V # addr.NumWords # _gfx90a
: MIMG_Sampler_gfx90a <op, asm, dst_rc, addr.RegClass,
|
@@ -507,27 +510,32 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, | |||
tryDecodeInst(DecoderTableGFX90A64, MI, QW, Address, CS)) | |||
break; | |||
|
|||
if (tryDecodeInst(DecoderTableGFX864, MI, QW, Address, CS)) | |||
if ((isVI() || isGFX9()) && |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Historical wart: namespace "GFX8" is used for lots of instructions that are shared with GFX9. Perhaps it should be renamed to "GFX8GFX9" or split into a shared part and a GFX8-only part. But I didn't want to touch that in this patch.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it would be better to have a common table for all sub targets with a shared encoding. That looks more like normal cpu targets, where you can emit instructions that will just fault on unsupported CPUs. Really we probably should be using different top level targets for each major encoding change
@@ -527,7 +527,7 @@ multiclass MIMG_NoSampler_Src_Helper <mimgopc op, string asm, | |||
let ssamp = 0 in { | |||
if op.HAS_GFX10M then { | |||
def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32, | |||
!if(enableDisasm, "GFX10", "")>; | |||
!if(enableDisasm, "GFX8", "")>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is fixing a mistake that was previously harmless because we tried all namespaces on all subtargets.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM. We would not need to do the checks manually if the disassembler TableGen backend a) tests predicates as early as possible and b) allows differently predicated / mutually exclusive instructions in the same decoding table. This would also greatly reduce the size of our decoding tables, which currently contain a predicate node at the end of every single decoding path.
If |
Yes, that's the point, wouldn't need multiple decoding attempts at all. I can imagine |
Speed up disassembly by only calling tryDecodeInst for DecoderTables
that make sense for the current subtarget.
This gives a 1.3x speed-up on check-llvm-mc-disassembler-amdgpu in my
Release+Asserts build.