-
Notifications
You must be signed in to change notification settings - Fork 10.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] New ttracedata intrinsics #70235
Conversation
Add llvm.amdgcn.s.ttracedata and llvm.amdgcn.s.ttracedata.imm which map directly to the corresponding instructions s_ttracedata and s_ttracedata_imm. These are inherently whole-wave operations so any non-uniform inputs are readfirstlaned.
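@llvm/pr-subscribers-llvm-ir @llvm/pr-subscribers-backend-amdgpu Author: Jay Foad (jayfoad) Changes: Add llvm.amdgcn.s.ttracedata and llvm.amdgcn.s.ttracedata.imm which map directly to the corresponding instructions s_ttracedata and s_ttracedata_imm. These are inherently whole-wave operations so any non-uniform inputs are readfirstlaned. Full diff: https://github.com/llvm/llvm-project/pull/70235.diff 4 Files Affected: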
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 5f1d1d932f74cbd..a3acfccd00f8e16 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1697,6 +1697,13 @@ def int_amdgcn_s_setprio :
DefaultAttrsIntrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
IntrHasSideEffects]>;
+def int_amdgcn_s_ttracedata :
+ DefaultAttrsIntrinsic<[], [llvm_i32_ty],
+ [IntrNoMem, IntrHasSideEffects]>;
+def int_amdgcn_s_ttracedata_imm :
+ DefaultAttrsIntrinsic<[], [llvm_i16_ty],
+ [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]>;
+
// This is IntrHasSideEffects so it can be used to read cycle counters.
def int_amdgcn_s_getreg :
ClangBuiltin<"__builtin_amdgcn_s_getreg">,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 5b056bd9e5dba2c..f117f732cb84ffb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3064,6 +3064,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
constrainOpWithReadfirstlane(B, MI, 2);
return;
}
+ case Intrinsic::amdgcn_s_ttracedata:
+ constrainOpWithReadfirstlane(B, MI, 1); // M0
+ return;
case Intrinsic::amdgcn_raw_buffer_load_lds:
case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {
applyDefaultMapping(OpdMapper);
@@ -4653,6 +4656,13 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
break;
}
+ case Intrinsic::amdgcn_s_ttracedata: {
+ // This must be an SGPR, but accept a VGPR.
+ unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
+ AMDGPU::SGPRRegBankID);
+ OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
+ break;
+ }
case Intrinsic::amdgcn_end_cf: {
unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 2f3b0ff2f76215e..0ec4f8150bfcc06 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1500,7 +1500,10 @@ def S_INCPERFLEVEL : SOPP_Pseudo <"s_incperflevel", (ins i32imm:$simm16), "$simm
def S_DECPERFLEVEL : SOPP_Pseudo <"s_decperflevel", (ins i32imm:$simm16), "$simm16",
[(int_amdgcn_s_decperflevel timm:$simm16)]> {
}
-def S_TTRACEDATA : SOPP_Pseudo <"s_ttracedata", (ins)> {
+
+let Uses = [M0] in
+def S_TTRACEDATA : SOPP_Pseudo <"s_ttracedata", (ins), "",
+ [(int_amdgcn_s_ttracedata M0)]> {
let simm16 = 0;
let fixed_imm = 1;
}
@@ -1544,8 +1547,10 @@ let SubtargetPredicate = isGFX10Plus in {
[(SIdenorm_mode (i32 timm:$simm16))]>;
}
+ let hasSideEffects = 1 in
def S_TTRACEDATA_IMM :
- SOPP_Pseudo<"s_ttracedata_imm", (ins s16imm:$simm16), "$simm16">;
+ SOPP_Pseudo<"s_ttracedata_imm", (ins s16imm:$simm16), "$simm16",
+ [(int_amdgcn_s_ttracedata_imm timm:$simm16)]>;
} // End SubtargetPredicate = isGFX10Plus
let SubtargetPredicate = isGFX11Plus in {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.ttracedata.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.ttracedata.ll
new file mode 100644
index 000000000000000..37b5357950e648b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.ttracedata.ll
@@ -0,0 +1,53 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
+
+declare void @llvm.amdgcn.s.ttracedata(i32)
+declare void @llvm.amdgcn.s.ttracedata.imm(i16)
+
+define amdgpu_cs void @ttracedata_c() {
+; GFX11-LABEL: ttracedata_c:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_mov_b32 m0, 0xf4240
+; GFX11-NEXT: s_ttracedata
+; GFX11-NEXT: s_endpgm
+ call void @llvm.amdgcn.s.ttracedata(i32 1000000)
+ ret void
+}
+
+define amdgpu_cs void @ttracedata_s(i32 inreg %val) {
+; GFX11-LABEL: ttracedata_s:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_mov_b32 m0, s0
+; GFX11-NEXT: s_ttracedata
+; GFX11-NEXT: s_endpgm
+ call void @llvm.amdgcn.s.ttracedata(i32 %val)
+ ret void
+}
+
+define amdgpu_cs void @ttracedata_v(i32 %val) {
+; GFX11-SDAG-LABEL: ttracedata_v:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: s_mov_b32 m0, s0
+; GFX11-SDAG-NEXT: s_ttracedata
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: ttracedata_v:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 m0, v0
+; GFX11-GISEL-NEXT: s_ttracedata
+; GFX11-GISEL-NEXT: s_endpgm
+ call void @llvm.amdgcn.s.ttracedata(i32 %val)
+ ret void
+}
+
+define amdgpu_cs void @ttracedata_imm() {
+; GFX11-LABEL: ttracedata_imm:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_ttracedata_imm 0x3e8
+; GFX11-NEXT: s_endpgm
+ call void @llvm.amdgcn.s.ttracedata.imm(i16 1000)
+ ret void
+}
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
def int_amdgcn_s_ttracedata : | ||
DefaultAttrsIntrinsic<[], [llvm_i32_ty], | ||
[IntrNoMem, IntrHasSideEffects]>; | ||
def int_amdgcn_s_ttracedata_imm : |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we really need the imm form, or can the backend just fold the register version with appropriate constant inputs to the imm instruction?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We really need it. They do different things. s_ttracedata emits a 32-bit "token" into ttracedata. s_ttracedata_imm emits an 8-bit "token" (the low 8 bits of the imm16 operand).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is in no way obvious from the manual descriptions.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, I think you would have to read the instruction descriptions in conjunction with the separate threadtrace documentation, and most or all of that is not public anyway.
Ping! |
Add llvm.amdgcn.s.ttracedata and llvm.amdgcn.s.ttracedata.imm which map
directly to the corresponding instructions s_ttracedata and
s_ttracedata_imm. These are inherently whole-wave operations so any
non-uniform inputs are readfirstlaned.